# This is code to replicate the analyses and figures from the paper
# "A One Health framework for exploring the zoonotic web: a case study".

# Code developed by A. Desvars-Larrive and G. A. Puspitarani


#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#
#####                    Requirements                          ######
#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#

# Packages needed by this script. Each package is listed once; "remotes" is
# attached further down with library(remotes) and "rstudioapi" is called when
# the working directory is set, so both are part of the install check too.
my_packages <- c(
  "ggplot2", "dplyr", "patchwork", "stringr", "ggfittext", "tidyr", "caret",
  "matrixStats", "plyr", "egg", "RColorBrewer", "igraph", "ggrepel", "ggraph",
  "bipartite", "tidyverse", "rsetse", "hrbrthemes", "lubridate", "forcats",
  "CINNA", "bc3net", "purrr", "scales", "ggalluvial", "cowplot",
  "remotes", "rstudioapi"
)

# Extract not installed packages
not_installed <- setdiff(my_packages, installed.packages()[, "Package"])
# Install not installed packages
if (length(not_installed) > 0) install.packages(not_installed)

# Import libraries
library(ggplot2)
library(dplyr)
library(stringr)
library(ggfittext)
library(tidyr)
library(scales)
library(ggalluvial)
library(hrbrthemes)
library(igraph)
library(ggraph)
library(ggrepel)
library(patchwork) 
library(plyr)
library(matrixStats)
library(remotes)
library(bipartite)
library(tidyverse)
library(caret)
library(rsetse)
library(egg)
library(lubridate)
library(forcats)
library(CINNA)
library(bc3net)
library(purrr)
library(RColorBrewer)
library(cowplot)

# Set working directory to data location (the folder holding this script).
# The active-document path is only available inside a running RStudio session
# and is empty for an unsaved document, so the call is guarded; in any other
# situation the current working directory is kept and reported.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable() &&
    nzchar(rstudioapi::getActiveDocumentContext()$path)) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
} else {
  message("Script not run from a saved RStudio document; keeping working directory: ",
          getwd())
}

#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#
########    Import data and additional cleaning             #########
#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#

# Raw table of collected records
table_final_import <- read.csv("table_final.csv", encoding="UTF-8")

# Manual corrections applied on top of the imported table.
# All taxonomy fixes are keyed on the *original* ncbi_pathog value, which is
# why ncbi_pathog itself is rewritten last. Within one mutate() the columns
# are updated in the order written, so ncbi_host_comm is computed while
# ncbi_host_sci still holds "Sus scrofa".
# NOTE(review): the key "Parainfluenza virus 4 " carries a trailing space and
# "Toxoascaris leonina" / "Ancylostoma tubaeformae" are spelled as in the
# original script -- confirm they match the values stored in table_final.csv.
table_final <- table_final_import %>%
  dplyr::mutate(
    genus_pathog = case_when(
      ncbi_pathog == "Mycobacterium chelonae" ~ "Mycobacterium",
      ncbi_pathog == "Ancylostoma tubaeformae" ~ "Ancylostoma",
      ncbi_pathog == "Escherichia coli ESBL" ~ "Escherichia",
      ncbi_pathog == "Toxoascaris leonina" ~ "Toxascaris",
      TRUE ~ genus_pathog
    ),
    family_pathog = case_when(
      ncbi_pathog == "Mycobacterium chelonae" ~ "Mycobacteriaceae",
      ncbi_pathog == "Ancylostoma tubaeformae" ~ "Ancylostomatidae",
      ncbi_pathog == "Escherichia coli ESBL" ~ "Enterobacteriaceae",
      ncbi_pathog == "Toxoascaris leonina" ~ "Ascarididae",
      ncbi_pathog == "Borna disease virus" ~ "Bornaviridae",
      ncbi_pathog == "Parainfluenza virus 4 " ~ "Paramyxoviridae",
      ncbi_pathog == "Coronavirus" ~ "Coronaviridae",
      TRUE ~ family_pathog
    ),
    order_pathog = case_when(
      ncbi_pathog == "Mycobacterium chelonae" ~ "Mycobacteriales",
      ncbi_pathog == "Ancylostoma tubaeformae" ~ "Strongylida",
      ncbi_pathog == "Escherichia coli ESBL" ~ "Enterobacterales",
      ncbi_pathog == "Toxoascaris leonina" ~ "Rhabditida",
      ncbi_pathog == "Borna disease virus" ~ "Mononegavirales",
      ncbi_pathog == "Parainfluenza virus 4 " ~ "Mononegavirales",
      ncbi_pathog == "Coronavirus" ~ "Nidovirales",
      ncbi_pathog == "Herpesvirus" ~ "Herpesvirales",
      TRUE ~ order_pathog
    ),
    superkingdom_pathog = case_when(
      ncbi_pathog == "Parainfluenza virus 4 " ~ "Viruses",
      TRUE ~ superkingdom_pathog
    ),
    # Pig and wild boar share the scientific name "Sus scrofa": restore the
    # common name "wild boar" from name_host ...
    ncbi_host_comm = case_when(
      ncbi_host_sci == "Sus scrofa" & name_host == "wild boar" ~ name_host,
      TRUE ~ ncbi_host_comm
    ),
    # ... and give wild boar its own scientific label for the network analysis
    ncbi_host_sci = case_when(
      ncbi_host_sci == "Sus scrofa" & name_host == "wild boar" ~ "Sus scrofa (w)",
      TRUE ~ ncbi_host_sci
    ),
    # Change Orthobunyavirus tahynaense to Tahyna virus
    ncbi_pathog = case_when(
      ncbi_pathog == "Orthobunyavirus tahynaense" ~ "Tahyna virus",
      TRUE ~ ncbi_pathog
    )
  )

# Food categories
# NOTE(review): "latin9" is passed through `encoding`; confirm whether
# `fileEncoding` was intended, as ?read.csv documents the two differently.
categories_food <- read.csv("categories_food.csv", encoding="latin9")

# Yearly PubMed counts (columns Year and Count are used further down)
PubMed_Timeline_Results_by_Year <- read.csv("PubMed_Timeline_Results_by_Year.csv", encoding = "UTF-8")

#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~
####################### Research trends  ###########################
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~

# TRENDS  - ALL SAMPLES INVESTIGATED (POSITIVE AND NEGATIVE)
# Will show the intensity of the scientific interest per compartment

# How many distinct references (ref_id) the table contains
nb.studies <- dplyr::n_distinct(table_final$ref_id)
nb.studies # 246

# Publication types: one row per (reference, type), then count and share (%)
type_publi <- table_final %>%
  dplyr::distinct(ref_id, type_publication) %>%
  dplyr::group_by(type_publication) %>%
  dplyr::summarise(count = dplyr::n()) %>%
  dplyr::mutate(percent = round(count / sum(count) * 100, 1))
type_publi

# Number of distinct references per publication year
years <- table_final %>%
  dplyr::distinct(ref_id, year_published) %>% # one row per reference and year
  dplyr::group_by(year_published) %>%
  dplyr::tally()

### Relative increase between the two halves of the period
# Length of the series is fixed to 2022 - 1975 (47 years); the split point is
# the latest publication year minus half of that length (rounded).
nb_year_serie <- 2022 - 1975

half_1 <- dplyr::filter(
  years,
  year_published < max(years$year_published) - round(nb_year_serie / 2)
)

half_2 <- dplyr::filter(
  years,
  year_published >= max(years$year_published) - round(nb_year_serie / 2)
)

# (second half - first half) / first half, printed as a ratio (not x100)
(sum(half_2$n) - sum(half_1$n)) / sum(half_1$n) # 17.69

# Studies per publication year and superkingdom of the zoonotic agent
# (all agents studied, found or not). A missing superkingdom is shown as
# "Other"; each reference counts once per (year, superkingdom).
years_typ_agent_st <- table_final %>%
  dplyr::select(ref_id, year_published, superkingdom_pathog) %>%
  dplyr::mutate(superkingdom_pathog = dplyr::coalesce(superkingdom_pathog, "Other")) %>%
  dplyr::distinct() %>%
  dplyr::group_by(year_published, superkingdom_pathog) %>%
  dplyr::summarise(nb_studies = dplyr::n())

# Stacked bar chart of the counts above
stacked_year <- ggplot(
  years_typ_agent_st,
  aes(x = year_published, y = nb_studies, fill = superkingdom_pathog)
) +
  geom_bar(stat = "identity", position = "stack",
           color = "white", linewidth = 0.3, alpha = 0.9) +
  scale_fill_manual(
    name = "Superkingdom\nzoonotic agent",
    values = c("#66C2A5", "#FC8D62", "#2166AC", "#E78AC3")
  ) +
  labs(x = "Year of publication", y = "Number of studies") +
  theme_bw() +
  theme(
    text = element_text(size = 8),
    axis.title = element_text(size = 7),
    legend.text = element_text(size = 7),
    legend.title = element_text(size = 8),
    legend.key.size = unit(3, "mm")
  )
stacked_year
# Write the chart to disk
ggsave("Supp_Fig2.svg", stacked_year, width = 10, height = 8, units = "cm")

# Same half-period comparison on the PubMed yearly counts, for context.
# Here the series length is taken from the data (latest year - earliest year).
nb_year_serie_pub <- diff(range(PubMed_Timeline_Results_by_Year$Year))
half_1_pub <- dplyr::filter(
  PubMed_Timeline_Results_by_Year,
  Year < max(PubMed_Timeline_Results_by_Year$Year) - round(nb_year_serie_pub / 2)
)
half_2_pub <- dplyr::filter(
  PubMed_Timeline_Results_by_Year,
  Year >= max(PubMed_Timeline_Results_by_Year$Year) - round(nb_year_serie_pub / 2)
)
### relative increase (ratio, not x100)
(sum(half_2_pub$Count) - sum(half_1_pub$Count)) / sum(half_1_pub$Count) # 17.98

#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~
#~#~#~#~#~#~#~#~#~#~# Locations of the studies #~#~#~#~#~#~#~#~#~#~
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~
#
# All investigated places are used (positive AND negative results).
# A reference may list several comma-separated locations; each one becomes
# its own row. Abbreviations are expanded through a lookup table and any
# value that is not in the table is kept unchanged.
studies_loc <- table_final %>%
  dplyr::distinct(ref_id, location) %>%
  dplyr::filter(!is.na(location)) %>%
  dplyr::mutate(location = strsplit(as.character(location), ",")) %>%
  tidyr::unnest(location) %>%
  dplyr::mutate(across(where(is.character), str_trim)) %>% # trim surrounding whitespace
  dplyr::mutate(location = dplyr::coalesce(
    unname(c(
      B = "Burgenland",
      UA = "Upper Austria",
      LA = "Lower Austria",
      VIE = "Vienna",
      CA = "Carinthia",
      SA = "Salzburg",
      ST = "Styria",
      TY = "Tyrol",
      VO = "Vorarlberg",
      AUT = "Austria" # national studies
    )[location]),
    location
  )) %>%
  dplyr::group_by(location) %>%
  dplyr::summarise(nb_studies = dplyr::n()) %>%
  # share of each location among all (reference, location) pairs
  dplyr::mutate(percent = round(nb_studies / sum(nb_studies) * 100, 1))
studies_loc

# Studies per location and superkingdom of the zoonotic agent (all agents
# studied, found or not). Locations are split and expanded as for studies_loc;
# each reference counts once per (location, superkingdom).
region_typ_agent_st <- table_final %>%
  dplyr::filter(!is.na(location)) %>%
  dplyr::mutate(location = strsplit(as.character(location), ",")) %>%
  tidyr::unnest(location) %>%
  dplyr::mutate(across(where(is.character), str_trim)) %>% # trim surrounding whitespace
  dplyr::mutate(
    location = dplyr::coalesce(
      unname(c(
        B = "Burgenland",
        UA = "Upper Austria",
        LA = "Lower Austria",
        VIE = "Vienna",
        CA = "Carinthia",
        SA = "Salzburg",
        ST = "Styria",
        TY = "Tyrol",
        VO = "Vorarlberg",
        AUT = "Austria" # national studies
      )[location]),
      location
    ),
    # a missing superkingdom is reported as "Other"
    superkingdom_pathog = dplyr::coalesce(superkingdom_pathog, "Other")
  ) %>%
  dplyr::distinct(ref_id, location, superkingdom_pathog) %>%
  dplyr::group_by(location, superkingdom_pathog) %>%
  dplyr::summarise(nb_studies = dplyr::n()) %>%
  # drop locations that are not a federal state (only 5 studies)
  dplyr::filter(!location %in% c("Eastern Austria", "northern limestone alps",
                                 "south eastern Austria", "south-west Austria",
                                 "west Austria")) %>%
  # reverse alphabetical superkingdom order within each location; the label
  # positions computed afterwards accumulate counts in this row order
  dplyr::arrange(location, desc(superkingdom_pathog))
region_typ_agent_st

# Per location: share of each superkingdom (percent) and the mid-point of its
# bar segment (pos = running total minus half of the segment), used to place
# the percentage labels.
region_typ_agent_st <- ddply(
  region_typ_agent_st, .(location), transform,
  percent = nb_studies / sum(nb_studies) * 100,
  pos = (cumsum(nb_studies) - 0.5 * nb_studies)
)
# Label text: percentage without decimals followed by "%"
region_typ_agent_st$label <- paste0(sprintf("%.0f", region_typ_agent_st$percent), "%")

# Horizontal stacked bars, one bar per location, labelled with the shares
stacked_region <- ggplot(
  region_typ_agent_st,
  aes(x = location, y = nb_studies, fill = superkingdom_pathog)
) +
  geom_bar(stat = "identity", position = "stack",
           color = "white", linewidth = 0.3, alpha = 0.9) +
  scale_fill_manual(
    name = "Superkingdom zoonotic agent",
    values = c("#66C2A5", "#FC8D62", "#E78AC3") # match colors to stacked_year
  ) +
  labs(x = "Location", y = "Number of studies") +
  geom_text(aes(y = pos, label = label), size = 2.5) +
  coord_flip() +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.key.size = unit(4, "mm"),
    legend.text = element_text(size = 7.5),
    text = element_text(size = 8),
    axis.title = element_text(size = 8)
  )
stacked_region
# Write the chart to disk
ggsave("Supp_Fig3.svg", stacked_region, width = 16, height = 8, units = "cm")

#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#~#~#~#
#~#~#~#~#~#~#~#~#~#~ Describe what has been investigated #~#~#~#~#~#~#~#~#~#~#
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#~#~#~#

# Distinct zoonotic agents investigated (not the same as agents retrieved)
nb.agent.investig <- dplyr::n_distinct(table_final$ncbi_pathog)
nb.agent.investig # 227
sort(unique(table_final$ncbi_pathog))

## Distinct hosts investigated (NA counts as one value here)
dplyr::n_distinct(table_final$ncbi_host_sci) # 222 but there's one NA

# Host species investigated (positive and negative), counted per host type
host.investig_type <- table_final %>%
  dplyr::filter(!is.na(ncbi_host_sci)) %>% # only hosts with a NCBI-resolved taxonomy
  dplyr::distinct(ncbi_host_sci, type_host) %>%
  dplyr::group_by(type_host) %>%
  dplyr::summarise(nb_type = dplyr::n()) %>%
  dplyr::mutate(percent = round(nb_type / sum(nb_type) * 100, 1))
host.investig_type

# Rank zoonotic agents (at genus level) by the number of studies that
# investigated them. Each reference counts once per genus.
# All verbs are namespace-qualified: plyr is attached after dplyr in this
# script, so bare mutate()/arrange() would resolve to the plyr versions.
agent_studied_rank_genus <- table_final %>% ## all agents investigated
  dplyr::filter(!is.na(pathogen_1)) %>%
  # genus recorded as "unknown": fall back to the best agent description we have
  dplyr::mutate(genus_pathog = case_when(genus_pathog == "unknown" ~ ncbi_pathog,
                                         TRUE ~ genus_pathog)) %>%
  dplyr::filter(!is.na(genus_pathog)) %>%
  dplyr::distinct(ref_id, genus_pathog) %>%
  dplyr::group_by(genus_pathog) %>%
  dplyr::summarize(nb_studies = n()) %>%
  dplyr::arrange(-nb_studies) %>% # most studied first
  dplyr::mutate(percent = round(nb_studies/sum(nb_studies)*100,1))
agent_studied_rank_genus
# Export table
write.csv(agent_studied_rank_genus, "agent_studied_rank_genus.csv", row.names =  FALSE)

# Vectors investigated (positive and negative): rows with a vector name
vector.investig.df <- dplyr::filter(table_final, !is.na(sci_name_vector_1))
nb.vect.investigated <- dplyr::n_distinct(vector.investig.df$ncbi_vector)
nb.vect.investigated #21 (count only those with resolved taxonomy)
# Number of studies per type of vector (each reference once per type)
type_vector_study <- vector.investig.df %>%
  dplyr::distinct(ref_id, type_vector) %>%
  dplyr::group_by(type_vector) %>%
  dplyr::summarise(nb_studies = dplyr::n()) %>%
  dplyr::mutate(percent = round(nb_studies / sum(nb_studies) * 100, 1))
type_vector_study

# Environmental media investigated (positive AND negative).
# Detailed descriptions are collapsed into broader media before counting.
# NOTE(review): the yearly environment table further down also maps
# "water 1500m asl" and "soil 1500m asl"; confirm whether those two values
# should be collapsed here as well.
env.investig.df <- table_final %>%
  dplyr::filter(!is.na(type_env)) %>%
  dplyr::mutate(type_env = case_when(
    type_env == "game butchery, slaughter knive" ~ "slaughter knife",
    type_env %in% c("food processing plant food preperation area",
                    "food processing plant drain water",
                    "food processing plant food conveyor belt") ~ "food processing plant",
    type_env %in% c("river water", "surface water") ~ "water",
    TRUE ~ type_env
  ))

env.media.investigated <- dplyr::n_distinct(env.investig.df$type_env)
env.media.investigated  #8


# Food matrices investigated: rows with a food origin (type_food_1), with the
# product description (type_food_2) homogenised into broader labels.
# case_when() uses the first matching branch, so "raw cheese" and "cheese" are
# always labelled "dairy product"; the former "raw cheese" test in the
# "cheese" branch could never match and has been removed (output unchanged).
# NOTE(review): "brie" is still labelled "cheese" while "cheese" itself becomes
# "dairy product" -- confirm which of the two labels is intended.
food.investig.df <- table_final %>%
  dplyr::filter(!is.na(type_food_1)) %>%
  dplyr::mutate(type_food_2 = case_when(type_food_2 == "meat product" | type_food_2 == "meat and meat product" | type_food_2 == "minced meat" | type_food_2 == "ground meat" | type_food_2 == "smoked bacon" ~ "meat", # replace description of the product to homogenise
                                        type_food_2 == "dairy"| type_food_2 == "milk and milk product" | type_food_2 == "milk" | type_food_2 == "raw milk" |
                                          type_food_2 == "raw cheese" | type_food_2 == "cheese"  ~ "dairy product", # replace description of the product to homogenise
                                        type_food_2 == "brie" ~  "cheese",
                                        type_food_2 == "döner kebab" | type_food_2 == "kebab-skewer"  ~ "kebab",
                                        TRUE ~ type_food_2))

# Food origin: number of studies per type_food_1 (each reference once per
# origin); percent is the share among all (reference, origin) pairs.
food_origin_invest <- food.investig.df %>%
  dplyr::distinct(ref_id, type_food_1) %>%
  dplyr::group_by(type_food_1) %>%
  dplyr::summarise(nb_studies = dplyr::n()) %>%
  dplyr::mutate(percent = round(nb_studies / sum(nb_studies) * 100, 1)) %>%
  dplyr::arrange(dplyr::desc(percent))
food_origin_invest

# Distinct references and distinct food origins among the food records
nb_study_food <- dplyr::n_distinct(food.investig.df$ref_id) #30
nb_source_food <- dplyr::n_distinct(food.investig.df$type_food_1) #23
# Records of food that is not plant-based (this object is a data frame)
nb_study_animal_food <- dplyr::filter(food.investig.df, type_food_1 != "plant-based food")
# nb_study_food is assigned a second time here and ends up holding the number
# of distinct references among the non-plant-based food records
nb_study_food <- dplyr::n_distinct(nb_study_animal_food$ref_id) #30

#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#~#~#~#
#~#~#~#~#~#~#~#~#~#~#~#~#~ Research trends #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~# 
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#~#~#~#

# plant-based food are included into the environment compartment
# vectors are included into the environment compartment

# Hosts: number of records per year and compartment (all investigated).
# Wildlife, companion animals and livestock are pooled as "animal"; any other
# host type keeps its own label.
host_yr <- table_final %>%
  dplyr::filter(!is.na(name_host)) %>%
  dplyr::mutate(type = case_when(
    type_host %in% c("wildlife", "companion animal", "livestock") ~ "animal",
    TRUE ~ type_host
  )) %>%
  #dplyr::distinct(ref_id,year_published, type) %>% # add this line if we investigate number of publications
  dplyr::group_by(year_published, type) %>%
  dplyr::summarise(n = dplyr::n()) %>%
  dplyr::select(year_published, n, type) # column order: year, count, compartment
host_yr

# Vectors: number of records per year (all investigated); every row is
# labelled with the "environment" compartment
vector_yr <- table_final %>%
  dplyr::filter(!is.na(type_vector)) %>%
  dplyr::group_by(year_published) %>%
  dplyr::summarise(n = dplyr::n()) %>%
  dplyr::mutate(type = "environment")
vector_yr

# Environmental media: number of records per year (all investigated), labelled
# with the "environment" compartment. The recoding of type_env is kept from
# the original script; the yearly count below does not depend on it.
envir_yr <- table_final %>%
  dplyr::filter(!is.na(type_env)) %>%
  dplyr::select(year_published, type_env) %>%
  dplyr::mutate(type_env = case_when(
    type_env == "game butchery, slaughter knive" ~ "slaughter knife",
    type_env %in% c("food processing plant food preperation area",
                    "food processing plant drain water",
                    "food processing plant food conveyor belt") ~ "food processing plant",
    type_env %in% c("river water", "surface water", "water 1500m asl") ~ "water",
    type_env == "soil 1500m asl" ~ "soil",
    TRUE ~ type_env
  )) %>%
  dplyr::group_by(year_published) %>%
  dplyr::summarise(n = dplyr::n()) %>%
  dplyr::mutate(type = "environment")
envir_yr

# Food: number of records per year and compartment (all investigated).
# Plant-based food belongs to the environment compartment, every other food
# origin to the animal compartment.
food_yr <- table_final %>%
  dplyr::filter(!is.na(type_food_1)) %>%
  dplyr::mutate(type = dplyr::if_else(type_food_1 == "plant-based food",
                                      "environment", "animal")) %>%
  dplyr::group_by(year_published, type) %>%
  dplyr::summarise(n = dplyr::n()) %>%
  dplyr::select(year_published, n, type) # column order: year, count, compartment
food_yr

# Stack the four yearly tables (hosts, vectors, environment, food).
# host_yr and food_yr are grouped tibbles while the other two are plain
# tibbles; dplyr::bind_rows() combines them by column name without relying on
# how base rbind() dispatches on objects of different classes.
publi_type_yr <- dplyr::bind_rows(host_yr, vector_yr, envir_yr, food_yr)
colnames(publi_type_yr) <- c("Year", "n", "Compartment")

# Panel b: yearly number of investigations per compartment, one facet per
# compartment, with a loess trend line and its ribbon.
# NOTE(review): alpha = 0.6 is given inside aes(), i.e. as a mapped aesthetic
# rather than a fixed setting -- confirm this is intended.
trend_compartment <- publi_type_yr  %>%
  ggplot( aes(x=Year , y=n, group=Compartment, color=Compartment)) +
  geom_point(size = 0.5) +
  ylab("Interest (number of investigations)")+
  theme_bw() +
  geom_ribbon(aes(fill = Compartment,alpha = 0.6), stat = "smooth", method = "loess",  color = NA)+
  scale_fill_manual(values= c("dodgerblue3","#33CC00", "goldenrod")) +
  stat_smooth(
    aes(color = Compartment, fill = Compartment),
    method = "loess", se = FALSE)+
  scale_colour_manual(values= c("dodgerblue3","#33CC00", "goldenrod"))+
  facet_wrap(~Compartment) +
  theme(legend.position="none") +
  ggtitle("b")
trend_compartment

## Trend graphs
# Zoonotic agents STUDIED: number of times an agent was investigated, per year
# and superkingdom (intensity of scientific interest per type of agent)

agent_yr <- table_final %>%
  dplyr::filter(!is.na(pathogen_1)) %>%
  #dplyr::distinct(ref_id,year_published, type) %>% # add this line if we investigate number of publications
  dplyr::group_by(year_published, superkingdom_pathog) %>%
  dplyr::summarise(n = dplyr::n())
agent_yr

# Panel a: one facet per superkingdom (rows without superkingdom are left
# out), points plus a loess trend line and its ribbon
trend_agent_typ <- agent_yr %>%
  dplyr::filter(!is.na(superkingdom_pathog)) %>%
  ggplot(aes(x = year_published, y = n,
             group = superkingdom_pathog, color = superkingdom_pathog)) +
  geom_point(size = 0.5) +
  geom_ribbon(aes(fill = superkingdom_pathog, alpha = 0.6),
              stat = "smooth", method = "loess", color = NA) +
  stat_smooth(aes(color = superkingdom_pathog, fill = superkingdom_pathog),
              method = "loess", se = FALSE) +
  scale_fill_manual(values = c("#66C2A5", "#FC8D62", "#E78AC3")) +
  scale_colour_manual(values = c("#66C2A5", "#FC8D62", "#E78AC3")) +
  facet_wrap(~superkingdom_pathog) +
  labs(x = "Year", y = "Interest (number of investigations)", title = "a") +
  theme_bw() +
  theme(legend.position = "none")
trend_agent_typ

## Stack panel a above panel b and save the combined figure
figure_trends <- egg::ggarrange(trend_agent_typ, trend_compartment,
                                nrow = 2, ncol = 1)
ggsave("Figure1.svg", figure_trends, width = 15, height = 15, units = "cm")


#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~
############## Zoonotic web actors and interfaces ##################
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~

# Total number of unique zoonotic agents found (positive compartment --> positive vector or host or environment)
# A record is kept when at least one of the four prevalence columns holds a
# value other than "negative". A comparison with NA gives NA, and `|` yields
# TRUE as soon as one operand is TRUE, so rows whose reported prevalences are
# all "negative" or missing do not pass the filter.
agent.found.df <- table_final %>%
  filter (prev_vector != "negative" | prev_anim_host != "negative" | prev_env != "negative" | prev_food != "negative" ) %>%  # drop records whose reported prevalences are all "negative"
  filter (is.na(prev_vector) == FALSE | is.na(prev_anim_host) == FALSE | is.na(prev_env) == FALSE  | is.na(prev_food) == FALSE)  # keep records with at least one reported (non-NA) prevalence

# Number of distinct agent names among the retained records
nb.agent.found <- length(unique(agent.found.df$ncbi_pathog))
nb.agent.found #197

#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#
#~#~#~#~# Dendrogram of the zoonotic agents found in Austria #~#~#~#~#
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

# Edge list of the dendrogram: superkingdom -> zoonotic agent, weighted by the
# number of records in which the agent was evidenced.
# The "Other" label for a missing superkingdom is set BEFORE grouping, so no
# grouping column is modified inside a grouped mutate().
agent_dend.df <- agent.found.df %>%
  dplyr::mutate(ncbi_pathog = case_when(pathogen_1 == "dog hair" ~ "dog hair",
                                        TRUE ~ ncbi_pathog )) %>%
  dplyr::filter(!is.na(ncbi_pathog)) %>%
  dplyr::mutate(superkingdom_pathog = case_when(is.na(superkingdom_pathog) ~ "Other",
                                                TRUE ~ superkingdom_pathog)) %>%
  dplyr::group_by(superkingdom_pathog, ncbi_pathog) %>%
  dplyr::summarize(value = n()) # Number of times a zoonotic agent was evidenced
colnames(agent_dend.df) <- c("from", "to", "value")

# Root level: one edge from "origin" to every superkingdom (NA shown as "Other")
origin <- data.frame(from = "origin", to = c(unique(agent.found.df$superkingdom_pathog))) %>%
  dplyr::mutate(to = case_when(is.na(to) ~ "Other",
                               TRUE ~ to))

# `origin` has no `value` column; bind_rows() matches columns by name and
# fills the missing one with NA, which base rbind() on data frames with
# different column counts does not do.
agent_dend.df <- dplyr::bind_rows(agent_dend.df, origin)

# Vertex table: every name that appears as a source or target of an edge,
# joined to the edge list to pick up its parent (`from`) and weight (`value`).
# The parent becomes the colour group; the four superkingdom vertices are
# assigned to their own group.
agent_vertices <- data.frame(name = unique(c(agent_dend.df$from, agent_dend.df$to))) %>%
  left_join(agent_dend.df, by = dplyr::join_by(name == to)) %>%
  dplyr::mutate(from = case_when(
    name %in% c("Bacteria", "Other", "Eukaryota", "Viruses") ~ name,
    TRUE ~ from
  )) %>%
  dplyr::rename(group = from)

# Label geometry for the circular dendrogram: angle, horizontal adjustment and
# flip of the labels on the left-hand side.
## Leaves are the vertices that never appear as the source (`from`) of an
## edge; they are numbered 1..nleaves and spread evenly around the circle.
agent_vertices$id <- NA
myleaves <- which(is.na(match(agent_vertices$name, agent_dend.df$from)))
nleaves <- length(myleaves)
# seq_len() gives an empty sequence when there is no leaf (seq(1:0) would not)
agent_vertices$id[myleaves] <- seq_len(nleaves)
agent_vertices$angle <- 90 - 360 * agent_vertices$id / nleaves

# Alignment of labels: right or left.
# Labels on the left part of the plot currently have an angle < -90
agent_vertices$hjust <- ifelse(agent_vertices$angle < -90, 1, 0)

# Flip those labels by 180 degrees to keep them readable
agent_vertices$angle <- ifelse(agent_vertices$angle < -90,
                               agent_vertices$angle + 180,
                               agent_vertices$angle)

# Build the graph from the edge list and the vertex table
graph_agent <- graph_from_data_frame(d = agent_dend.df, vertices = agent_vertices)

# Circular dendrogram: grey edges, leaf labels rotated with the precomputed
# angle/hjust, leaf points sized by the number of records and coloured by group
dendrogram_agent <- ggraph(graph_agent, layout = "dendrogram", circular = TRUE) +
  geom_edge_diagonal(colour = "grey") +
  geom_node_text(
    aes(x = x * 1.15, y = y * 1.15, filter = leaf, label = name,
        angle = angle, hjust = hjust, colour = group),
    size = 2.7
  ) +
  geom_node_point(
    aes(filter = leaf, x = x * 1.07, y = y * 1.07,
        colour = group, size = value, alpha = 0.2)
  ) +
  scale_colour_manual(values = c("#66C2A5", "#FC8D62", "#2166AC", "#E78AC3")) +
  scale_size_continuous(range = c(0.1, 10)) +
  expand_limits(x = c(-1.7, 1.7), y = c(-1.9, 1.7)) + # room for the outer labels
  theme_void() +
  theme(
    legend.position = "none",
    plot.margin = unit(rep(0, 4), "cm")
  )
dendrogram_agent

# Write the dendrogram to disk
ggsave("dendrogram_agent.svg", dendrogram_agent, width = 30, height = 30, units = "cm")

#~#~#~#~#~#  Pairs of zoonotic agent-host species - positive samples #~#~#~#~#~#
# Number of records per (agent, host) pair, keeping resolved hosts whose host
# prevalence is not "negative"; host names are lower-cased first.
matrix_host <- agent.found.df %>%
  dplyr::filter(!is.na(ncbi_host_sci), prev_anim_host != "negative") %>%
  dplyr::mutate(ncbi_host_sci = tolower(ncbi_host_sci)) %>%
  dplyr::group_by(ncbi_pathog, ncbi_host_sci) %>%
  dplyr::summarise(n = dplyr::n())
nb_pathog_detected_in_hosts <- dplyr::n_distinct(matrix_host$ncbi_pathog)
nb_pathog_detected_in_hosts #187 zoonotic agents were detected in animal and human hosts
nb_host_infected <- dplyr::n_distinct(matrix_host$ncbi_host_sci)
nb_host_infected # 155 animal hosts were found positive

#~#~#~#~#~#  Hosts found positive, per host type (companion animals, human, wildlife...):
# number of species and share (%) among the positive host species
host.found_type <- agent.found.df %>%
  dplyr::filter(!is.na(ncbi_host_sci), prev_anim_host != "negative") %>%
  dplyr::distinct(ncbi_host_sci, type_host) %>%
  dplyr::group_by(type_host) %>%
  dplyr::summarise(nb_type = dplyr::n()) %>%
  dplyr::mutate(percent = round(nb_type / sum(nb_type) * 100, 1))
host.found_type

# Same breakdown per taxonomic class of the host
host.found_class <- agent.found.df %>%
  dplyr::filter(!is.na(ncbi_host_sci), prev_anim_host != "negative") %>%
  dplyr::distinct(ncbi_host_sci, class_host) %>%
  dplyr::group_by(class_host) %>%
  dplyr::summarise(nb_species = dplyr::n()) %>%
  dplyr::mutate(percent = round(nb_species / sum(nb_species) * 100, 1))
host.found_class

# Number of distinct host orders among all records (an NA would count as one value)
dplyr::n_distinct(table_final$order_host) #31
# Number of distinct host orders among the records with an agent found
dplyr::n_distinct(agent.found.df$order_host) #27

# List of positive hosts with their taxonomy, sorted by class, order, species
list_hosts <- agent.found.df %>%
  dplyr::filter(!is.na(ncbi_host_sci), prev_anim_host != "negative") %>%
  dplyr::distinct(class_host, order_host, ncbi_host_sci, ncbi_host_comm, type_host) %>%
  dplyr::arrange(class_host, order_host, ncbi_host_sci)
list_hosts

write.csv(list_hosts, "list_hosts.csv", row.names = FALSE)

# Bubble chart zoonotic hosts-agents (positive samples) 
agent_host_order <- agent.found.df %>%  # ONLY POSITIVE SAMPLES
  dplyr::mutate(ncbi_pathog = case_when(pathogen_1 == "dog hair" ~ "dog hair",
                                        TRUE ~ ncbi_pathog )) %>% 
  dplyr::filter(!is.na(ncbi_host_sci)) %>%
  dplyr::filter(!prev_anim_host == "negative") %>%
  dplyr::mutate(order_host = case_when(is.na(order_host) == TRUE ~ ncbi_host_sci, # replace missing order by best host description we have
                                       TRUE ~ order_host)) %>% 
  dplyr::mutate(genus_pathog = case_when(is.na(genus_pathog) == TRUE ~ ncbi_pathog, # replace missing genus by best zoonotic agent description we have
                                         TRUE ~ genus_pathog)) %>%
  dplyr::mutate(genus_pathog = case_when(genus_pathog == "unknown" ~ ncbi_pathog, # replace missing genus by best zoonotic agent description we have
                                         TRUE ~ genus_pathog)) %>%
  dplyr::select (order_host, genus_pathog, ncbi_pathog) %>%
  dplyr::distinct() %>%
  dplyr::group_by(order_host, genus_pathog) %>%
  dplyr::summarise(value= n()) # number of zoonotic agents for each genus / host order

# Lookup table genus -> superkingdom of the zoonotic agent, built from the same
# host records (not flagged "negative") and with the same genus fallback rules
# as the host-order counts, so that both tables can be joined on genus_pathog.
type <- agent.found.df %>%
  dplyr::mutate(
    ncbi_pathog = case_when(
      pathogen_1 == "dog hair" ~ "dog hair",
      TRUE ~ ncbi_pathog
    )
  ) %>%
  dplyr::filter(
    !is.na(ncbi_host_sci),
    prev_anim_host != "negative",
    !is.na(pathogen_1),
    !is.na(family_host)
  ) %>%
  dplyr::mutate(
    # missing order: fall back to the best host description available
    order_host = case_when(
      is.na(order_host) ~ ncbi_host_sci,
      TRUE ~ order_host
    ),
    # missing or "unknown" genus: fall back to the best agent description available
    genus_pathog = case_when(
      is.na(genus_pathog) | genus_pathog == "unknown" ~ ncbi_pathog,
      TRUE ~ genus_pathog
    ),
    # agents without a superkingdom are pooled under "Other"
    superkingdom_pathog = case_when(
      is.na(superkingdom_pathog) ~ "Other",
      TRUE ~ superkingdom_pathog
    )
  ) %>%
  dplyr::distinct(genus_pathog, superkingdom_pathog) %>%
  na.omit() # drop rows where the genus is still missing after the fallback

# Attach the agent superkingdom to the host-order x genus counts.
# `relationship = "many-to-many"` states explicitly that a genus may match
# several rows on either side, which avoids dplyr's join warning.
bubble_order <- left_join(agent_host_order, type, by = "genus_pathog", relationship = "many-to-many")
colnames(bubble_order) <- c("Host", "Pathogen", "value", "Superkingdom")

# Figure 2a: bubble chart, one bubble per host order x agent genus, sized by the
# number of zoonotic agents and coloured by agent superkingdom.
# - arrange()/desc() are namespaced: plyr is attached after dplyr in this
#   script, so the bare names may resolve to the plyr versions.
# - The x axis shows `order_host` (renamed "Host"), so it is labelled as the
#   host taxonomic order rather than family.
plot_bubble_order_agents <- bubble_order %>%
  dplyr::arrange(dplyr::desc(value)) %>% # draw large bubbles first so small ones stay visible
  ggplot(aes(x = Host, y = Pathogen, size = value, color = Superkingdom)) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(2, 20), name = "Number of zoonotic agents") +
  scale_colour_manual(values = c("#66C2A5", "#FC8D62", "#2166AC", "#E78AC3")) +
  theme_ipsum() +
  theme(legend.position = "right") +
  ylab("Zoonotic agent genus") +
  xlab("Host taxonomic order") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 10)) +
  theme(axis.text.y = element_text(size = 10),
        axis.title.x = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 15)) +
  guides(color = guide_legend(override.aes = list(size = 10))) + # larger colour keys in the legend
  theme(legend.justification = "top") +
  ggtitle("")
ggsave("Figure2a.svg", plot_bubble_order_agents, width = 35, height = 35, units = "cm")

#~#~#~#~# Number of zoonotic agents per host order #~#~#~#~#
# Zoonotic agent richness per host order: number of distinct agents
# (ncbi_pathog) recorded in hosts of each order, excluding records flagged
# "negative"; sorted in increasing order for plotting.
zoon_rich_order <- agent.found.df %>%
  dplyr::filter(
    prev_anim_host != "negative",
    !is.na(ncbi_host_sci)
  ) %>%
  dplyr::mutate(
    ncbi_pathog = case_when(
      pathogen_1 == "dog hair" ~ "dog hair",
      TRUE ~ ncbi_pathog
    ),
    # missing order: fall back to the best host description available
    order_host = case_when(
      is.na(order_host) ~ ncbi_host_sci,
      TRUE ~ order_host
    ),
    # missing or "unknown" genus: fall back to the best agent description available
    genus_pathog = case_when(
      is.na(genus_pathog) | genus_pathog == "unknown" ~ ncbi_pathog,
      TRUE ~ genus_pathog
    )
  ) %>%
  dplyr::distinct(order_host, ncbi_pathog) %>%
  dplyr::group_by(order_host) %>%
  dplyr::summarise(value = n()) %>%
  arrange(value)
zoon_rich_order

# Figure 2b: horizontal bar chart; fct_inorder() keeps the bars in the sorted
# row order of the table instead of alphabetical order.
zoon_rich_order_plot <- ggplot(
  zoon_rich_order,
  aes(x = fct_inorder(order_host), y = value)
) +
  geom_col(fill = "lightblue") +
  coord_flip() +
  theme_minimal() +
  labs(
    x = "", # host taxonomic order (axis title left empty)
    y = "Number of zoonotic agents",
    title = ""
  ) +
  scale_y_continuous(breaks = seq(5, 85, by = 10)) +
  theme(
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    axis.title.x = element_text(size = 10, vjust = -1),
    axis.title.y = element_text(size = 10, angle = 0, vjust = 1)
  )
zoon_rich_order_plot
ggsave("Figure2b.svg", zoon_rich_order_plot, width = 15, height = 10, units = "cm")

#~#~#~#~#~# VECTORS #~#~#~#~#~# 
# Agents detected in vectors: number of distinct references per
# agent x vector pair, excluding records whose vector prevalence is "negative".
matrix_vector <- agent.found.df %>%
  dplyr::filter(
    !is.na(sci_name_vector_1),
    prev_vector != "negative"
  ) %>%
  dplyr::distinct(ref_id, ncbi_pathog, ncbi_vector) %>%
  group_by(ncbi_pathog, ncbi_vector) %>%
  dplyr::summarize(n = n())
matrix_vector

# Number of distinct zoonotic agents detected in vectors
nb_agent_dected_in_vector <- matrix_vector$ncbi_pathog %>% unique() %>% length()
nb_agent_dected_in_vector  #24 zoonotic agents were detected in vectors
# Their names, alphabetically
matrix_vector$ncbi_pathog %>% unique() %>% sort()

# Number and names of the vectors found positive
matrix_vector$ncbi_vector %>% unique() %>% length() #12 unique vector were found positive
matrix_vector$ncbi_vector %>% unique() %>% sort()

# Table vectors/zoonotic agents
# Same positive vector records as above, keeping the vector order and family so
# that the agents can be listed per vector taxon.
matrix_vector_2 <- agent.found.df %>%
  dplyr::filter(
    !is.na(sci_name_vector_1),
    prev_vector != "negative"
  ) %>%
  dplyr::distinct(ref_id, ncbi_pathog, ncbi_vector, order_vector, family_vector) %>%
  group_by(order_vector, family_vector, ncbi_vector, ncbi_pathog) %>%
  dplyr::summarize(n = n())
matrix_vector_2

# One row per vector with its agents collapsed into a "; "-separated string.
# NOTE(review): the formula interface of aggregate() omits rows with NA in any
# of the listed columns by default - confirm no vector lacks order/family.
matrix_vector_2_tab <- aggregate(
  ncbi_pathog ~ ncbi_vector + order_vector + family_vector,
  data = matrix_vector_2,
  FUN = function(agents) paste(agents, collapse = "; ")
)
write.csv(matrix_vector_2_tab, file = "matrix_vector_2_tab.csv", row.names = FALSE)

#~#~#~#~#~#  ENVIRONMENT #~#~#~#~#~# 
# Agents detected in the environment: records with an environment type whose
# environmental prevalence is not "negative". Records are de-duplicated on the
# raw environment label first, then the labels are harmonised into broader
# categories, and finally counted per agent x category.
matrix_env <- agent.found.df %>%
  dplyr::filter(
    !is.na(type_env),
    prev_env != "negative"
  ) %>%
  dplyr::distinct(ref_id, ncbi_pathog, type_env) %>%
  mutate(type_env = case_when(
    type_env == "game butchery, slaughter knive" ~ "slaughter knife",
    type_env %in% c("food processing plant food preperation area",
                    "food processing plant drain water",
                    "food processing plant food conveyor belt") ~ "food processing plant",
    type_env %in% c("river water", "surface water", "water 1500m asl") ~ "water",
    type_env == "soil 1500m asl" ~ "soil",
    TRUE ~ type_env
  )) %>%
  group_by(ncbi_pathog, type_env) %>%
  dplyr::summarize(n = n())
matrix_env

# Number of distinct zoonotic agents detected in the environment
nb_agent_dected_in_env <- matrix_env$ncbi_pathog %>% unique() %>% length()
nb_agent_dected_in_env  #11 zoonotic agents detected in the environment

# Number of harmonised environment categories found positive
matrix_env$type_env %>% unique() %>% length() #6 unique env were found positive
# Category names, alphabetically
matrix_env$type_env %>% unique() %>% sort()

# Table environment/zoonotic agents
# Same positive environmental records and label harmonisation as above, grouped
# environment-first so the agents can be listed per environment category.
matrix_envir_2 <- agent.found.df %>%
  dplyr::filter(
    !is.na(type_env),
    prev_env != "negative"
  ) %>%
  dplyr::distinct(ref_id, ncbi_pathog, type_env) %>%
  mutate(type_env = case_when(
    type_env == "game butchery, slaughter knive" ~ "slaughter knife",
    type_env %in% c("food processing plant food preperation area",
                    "food processing plant drain water",
                    "food processing plant food conveyor belt") ~ "food processing plant",
    type_env %in% c("river water", "surface water", "water 1500m asl") ~ "water",
    type_env == "soil 1500m asl" ~ "soil",
    TRUE ~ type_env
  )) %>%
  group_by(type_env, ncbi_pathog) %>%
  dplyr::summarize(n = n())
matrix_envir_2

# One row per environment category, agents collapsed into a "; "-separated string
matrix_envir_2_tab <- aggregate(
  ncbi_pathog ~ type_env,
  data = matrix_envir_2,
  FUN = function(agents) paste(agents, collapse = "; ")
)
write.csv(matrix_envir_2_tab, file = "matrix_envir_2_tab.csv", row.names = FALSE)

#~#~#~#~#~# FOOD #~#~#~#~#~# 
# Agents detected in food (positive samples only): the food origin
# (type_food_1) and the food group looked up in `categories_food` are pasted
# into a single "food" label, then distinct records are counted per
# agent x food label.
matrix_food <- agent.found.df %>%
  dplyr::filter(
    !is.na(type_food_1),
    prev_food != "negative"
  ) %>%
  dplyr::select(ref_id, ncbi_pathog, type_food_1, type_food_2) %>%
  left_join(categories_food, by = "type_food_2") %>%
  dplyr::distinct() %>%
  dplyr::select(ncbi_pathog, type_food_1, group) %>%
  unite("food", type_food_1, group, sep = " ") %>%
  dplyr::group_by(ncbi_pathog, food) %>%
  dplyr::summarize(n = n())

# Number of distinct zoonotic agents found in food
nb_agent_dected_in_food <- matrix_food$ncbi_pathog %>% unique() %>% length()
nb_agent_dected_in_food  #15 zoonotic agents found in food
# Their names, alphabetically
matrix_food$ncbi_pathog %>% unique() %>% sort()

# Number of food labels (origin + group) found positive
matrix_food$food %>% unique() %>% length() #31 types of food were found positive
# List of the food labels found positive
matrix_food$food %>% unique() %>% sort()

# Alluvial plot: zoonotic agents in food (positive samples)
# Input table: number of records (investigations) per agent genus, food origin
# and food group; the food group comes from the `categories_food` lookup.
genus_agent_food <- agent.found.df %>%
  dplyr::filter(
    !is.na(ncbi_pathog),
    !is.na(type_food_1),
    prev_food != "negative"
  ) %>%
  dplyr::select(genus_pathog, type_food_1, type_food_2) %>%
  left_join(categories_food, by = "type_food_2") %>%
  dplyr::select(-type_food_2) %>%
  dplyr::group_by(genus_pathog, type_food_1, group) %>%
  tally() # one row per combination, n = number of investigations
names(genus_agent_food) <- c("Genus", "Origin", "Type", "n")

# Supplementary Figure 5: three-axis alluvial diagram (agent genus -> food
# origin -> food type); flow width is the number of investigations and flows
# are coloured by genus. A few genera get a highlight colour, the rest grey.
alluvial_food <- ggplot(
  as.data.frame(genus_agent_food),
  aes(y = n, axis1 = Genus, axis2 = Origin, axis3 = Type)
) +
  geom_flow() +
  guides(fill = "none") + # the strata carry the labels, so no fill legend
  geom_alluvium(aes(fill = Genus), width = 1 / 12) +
  geom_stratum(alpha = .5) +
  scale_x_discrete(
    limits = c("Genus zoonotic agent", "Origin food", "Type food"),
    expand = c(.05, .05)
  ) +
  scale_fill_manual(values = c(
    "Yersinia" = "grey75",
    "Trichinella" = "orange",
    "Staphylococcus" = "grey85",
    "Salmonella" = "red",
    "Mycobacterium" = "grey80",
    "Listeria" = "grey70",
    "Escherichia" = "grey60",
    "Echinococcus" = "purple",
    "Clostridioides" = "grey65",
    "Campylobacter" = "grey70",
    "Brucella" = "grey60",
    "Anisakis" = "grey65"
  )) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 13)) +
  ylab("Number of investigations") +
  # stratum labels are fitted to their boxes (wrapped, never below size 3)
  geom_fit_text(
    stat = "stratum",
    aes(label = after_stat(stratum)),
    min.size = 3, reflow = TRUE, size = 13
  )
# Export
ggsave("Supp_Fig5.svg", alluvial_food, width = 30, height = 20, units = "cm")


# Ranking of positive food findings.
# food_rank: total number of investigations per agent genus.
food_rank <- genus_agent_food %>%
  dplyr::group_by(Genus) %>%
  dplyr::summarise(sum= sum(n))
# Grand total of investigations; used as the denominator of every percentage below
sum(food_rank$sum) #276
# NOTE(review): the numerators below (101, 62, 63, 154, 122, 7) are hard-coded;
# presumably they were read off food_rank / rank_pos_results_food /
# rank_pos_results_origin for one data snapshot. They will silently go stale if
# table_final.csv changes - confirm against the tables before reuse.
perc_listeria <- 101*100/sum(food_rank$sum)
perc_salmo <- 62*100/sum(food_rank$sum)
perc_escher <- 63*100/sum(food_rank$sum)

# Total number of investigations per food type
rank_pos_results_food <- genus_agent_food %>%
  dplyr::group_by(Type) %>%
  dplyr::summarise(tot = sum(n))
perc_meat <- 154*100/sum(food_rank$sum)

# Total number of investigations per food origin
rank_pos_results_origin <- genus_agent_food %>%
  dplyr::group_by(Origin) %>%
  dplyr::summarise(tot = sum(n))
perc_unspec <- 122*100/sum(food_rank$sum)
perc_plant <- 7*100/sum(food_rank$sum)

#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~
################ Imported and emerging zoonotic agents ############### 
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~
# Imported zoonotic agents
# Number of records per agent and country of import, restricted to records
# flagged imported == "yes" with a resolved agent name.
imported <- agent.found.df %>%
  dplyr::filter(
    !is.na(ncbi_pathog),
    imported == "yes"
  ) %>%
  dplyr::group_by(ncbi_pathog, imported, imported_country) %>%
  dplyr::summarize(nb_evidence = n())
imported
write.csv(imported, file = "imported.csv", row.names = FALSE)

# Distinct imported agents (sorted), how many there are, and the total number
# of supporting records
imported_list <- imported$ncbi_pathog %>% unique() %>% sort()
length(imported_list)
sum(imported$nb_evidence)

# Emerging zoonotic agents
# Number of records per emergence type, agent, host, vector and year, for
# records that carry an emergence flag and a resolved agent name.
emerging.df <- agent.found.df %>%
  dplyr::filter(
    !is.na(ncbi_pathog),
    !is.na(emerging)
  ) %>%
  dplyr::group_by(
    emerging, ncbi_pathog, superkingdom_pathog,
    ncbi_host_comm, ncbi_host_sci, ncbi_vector,
    year_data_collect, year_published
  ) %>%
  dplyr::summarize(nb_evidence = n()) %>%
  dplyr::filter(ncbi_pathog != "Babesia") # "Babesia" alone is not specific enough, so it is dropped
emerging.df
write.csv(emerging.df, file = "emerging.csv", row.names = FALSE)

# Graph
# Prepare the data for a horizontal timeline of emerging zoonotic agents:
# each record becomes a point above or below the time axis, placed at the year
# of data collection (or of publication when the collection year is missing).

# Emergence categories (sorted) and one colour per category.
# NOTE(review): four colours are supplied - assumes `emerging` has at most four
# distinct values; confirm.
emergence_type_levels <- sort(unique(emerging.df$emerging))
emergence_type_colors <- c("#C00000", "#FFC000",  "#00B050", "#0070C0") 

# Make the `emerging` column an ordered factor using the levels we defined above
emerging.df$emerging <- factor(emerging.df$emerging , levels= emergence_type_levels, ordered=TRUE)

# Set the heights we will use for the species names
# NOTE(review): 0.5 appears twice in this cycle (1st and 5th entry) while its
# negative counterpart is -1.25 - possibly 1.25 was intended; confirm.
positions <- c(0.5, -0.5, 1.0, -1.0, 0.5, -1.25, 1.5, -1.5) 
# Set the directions we will use for our milestone, for example above and below.
directions <- c(1, -1) 

# Assign the positions & directions to each date from those set above.
# Both vectors are recycled over the distinct collection years, in their order
# of first appearance in emerging.df.
line_pos <- data.frame(
  "date"=unique(emerging.df$year_data_collect),
  "position"=rep(positions, length.out=length(unique(emerging.df$year_data_collect))),
  "direction"=rep(directions, length.out=length(unique(emerging.df$year_data_collect))))
# Create columns with the specified positions and directions for each milestone event,
# then derive the plotting year `date`: the multi-year label "2016, 2017, 2018"
# is collapsed to "2016", and a missing collection year is replaced by the
# publication year.
emerging.df <- left_join(x=emerging.df, y=line_pos, by= c("year_data_collect" = "date"))%>%
  dplyr::mutate(date = year_data_collect) %>%
  dplyr::mutate(date  = case_when(date == "2016, 2017, 2018" ~ "2016",
                         TRUE ~ date )) %>%
  dplyr::mutate(date = case_when(is.na(year_data_collect) ~ as.character(year_published),
                                 TRUE ~ date))
# Convert the year labels to numbers; any label that is not a plain number
# becomes NA (with a coercion warning)
emerging.df$date <- as.numeric(as.character(emerging.df$date))

# Create a one year "buffer" at the start and end of the timeline
# NOTE(review): min()/max() are called without na.rm, so an NA in `date` would
# make seq() fail - assumes every record has a usable year; confirm.
year_date_range <- seq(min(emerging.df$date) - 1, max(emerging.df$date) + 1, by= 1)
year_format <-year_date_range 
#year_format <- sort(emerging.df$date)
year_df <- data.frame(year_date_range, year_format)

# Prepare graph formatting
text_offset <- 0.2 # offset the labels 0.2 away from scatter points
absolute_value<-(abs(emerging.df$position)) #  use the absolute value since we want to add the text_offset and increase space away from the scatter points 
text_position<- absolute_value + text_offset
emerging.df$text_position<- text_position * emerging.df$direction # keep the direction above or below for the labels to match the scatter points
# Create timeline coordinates with an x and y axis
timeline_plot<-ggplot(emerging.df,aes(x=date,y= position, col=emerging , label=ncbi_pathog)) +
  labs(col="Emergence") + # Add the label for type of emergence
  scale_color_manual(values=emergence_type_colors, labels=emergence_type_levels, drop = FALSE) + # Assigning the colors and order to the milestones
  theme_classic() + 
  geom_hline(yintercept=0,color = "black", linewidth=0.3) + # Plot a horizontal line at y=0 for the timeline
  geom_segment(data=emerging.df, aes(y=emerging.df$position,yend=0, xend=date), color='black', size=0.2) + # Plot the vertical lines for our timeline's milestone events
  geom_point(aes(y=emerging.df$position), size=3) + # plot the scatter points at the tips of the vertical lines and date
  theme(axis.line.y=element_blank(), # remove the axis since this is a horizontal timeline and postion the legend to the bottom
                                   axis.text.y=element_blank(),
                                   axis.title.x=element_blank(),
                                   axis.title.y=element_blank(),
                                   axis.ticks.y=element_blank(),
                                   axis.text.x =element_blank(),
                                   axis.ticks.x =element_blank(),
                                   axis.line.x =element_blank(),
                                   legend.position = "bottom") +
  geom_text(data=year_df, aes(x=year_date_range,y=-0.2,label=year_format, fontface="bold"),size=3, color='black') + # add the years
  geom_text(aes(y=emerging.df$text_position,label=emerging.df$ncbi_pathog, fontface="italic"),size=3.5, vjust=0.6, show.legend = FALSE) # add the labels to the timeline
# Print plot
print(timeline_plot)
# Export
ggsave("Supp_Fig8.svg", timeline_plot, width =23, height = 15, units="cm")

# Prepare list of emerging zoonotic agents for network representation (see below)
emerging_agents <- emerging.df$ncbi_pathog

#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~
###################### Zoonotic web ################################
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~

## Use ncbi-resolved  names
# Each table below lists agent x source pairs with their number of records and
# is renamed to the common layout (pathogen, matrix, n) so they can be stacked.

# Hosts: agent x host species, records not flagged "negative"
select2.1.0 <- agent.found.df %>%
  dplyr::filter(
    !is.na(ncbi_pathog),
    !is.na(ncbi_host_sci),
    prev_anim_host != "negative"
  ) %>%
  dplyr::group_by(ncbi_pathog, ncbi_host_sci) %>%
  dplyr::summarize(n = n())
names(select2.1.0) <- c("pathogen", "matrix", "n")

# Vectors: agent x vector genus, records not flagged "negative"
select3.0 <- agent.found.df %>%
  dplyr::filter(
    !is.na(ncbi_pathog),
    !is.na(genus_vector),
    prev_vector != "negative"
  ) %>%
  group_by(ncbi_pathog, genus_vector) %>% # vectors are aggregated at genus level
  dplyr::summarize(n = n())
names(select3.0) <- c("pathogen", "matrix", "n")

# Environment: agent x harmonised environment category, records not flagged
# "negative". The "1500m asl" water and soil labels are left as they are here.
select4.0 <- agent.found.df %>%
  dplyr::filter(
    !is.na(ncbi_pathog),
    !is.na(type_env),
    prev_env != "negative"
  ) %>%
  mutate(type_env = case_when(
    type_env == "game butchery, slaughter knive" ~ "slaughter knife",
    type_env %in% c("food processing plant food preperation area",
                    "food processing plant drain water",
                    "food processing plant food conveyor belt") ~ "food processing plant",
    type_env %in% c("river water", "surface water") ~ "water",
    TRUE ~ type_env
  )) %>%
  group_by(ncbi_pathog, type_env) %>%
  dplyr::summarize(n = n())
names(select4.0) <- c("pathogen", "matrix", "n")

# format data food
# Positive food records with the food origin (type_food_1) and the food group
# looked up in `categories_food`.
agent_food.0 <- agent.found.df %>%
  dplyr::filter(
    !is.na(ncbi_pathog),
    !is.na(type_food_1),
    prev_food != "negative"
  ) %>%
  dplyr::select(ncbi_pathog, type_food_1, type_food_2) %>%
  left_join(categories_food, by = "type_food_2") %>%
  dplyr::select(-type_food_2)
names(agent_food.0) <- c("ncbi_pathog", "food_origin", "type_food")

# Agent x food label (origin + type pasted together) with record counts; the
# redundant label "plant-based food plant based food" is shortened afterwards.
select5.0 <- agent_food.0 %>%
  unite("food", food_origin, type_food, sep = " ") %>%
  dplyr::group_by(ncbi_pathog, food) %>%
  dplyr::summarize(n = n()) %>%
  mutate(food = case_when(
    food == "plant-based food plant based food" ~ "plant-based food",
    TRUE ~ food
  ))
names(select5.0) <- c("pathogen", "matrix", "n")

# bind the datasets
# Stack the four agent x source tables (hosts, vectors, environment, food) into
# one edge table with columns pathogen, matrix, n.
agent_mat.0 <- rbind(select2.1.0, select3.0, select4.0, select5.0)

# One row per distinct zoonotic agent
unique.pathog3.0 <- agent_mat.0 %>%
  dplyr::select (-c(matrix,n)) %>% 
  distinct(pathogen , .keep_all = TRUE)

# type of matrix: host, vector, food, and environment
# Each *_type.0 table lists the distinct source names of one compartment
# together with the compartment label.
host_type.0 <- as.data.frame(cbind(select2.1.0$matrix, rep("host", length(select2.1.0$matrix)))) %>%
  distinct()
colnames(host_type.0 ) <- c("matrix", "type_matrix")  

vector_type.0 <- as.data.frame(cbind(select3.0$matrix, rep("vector", length(select3.0$matrix)))) %>%
  distinct()
colnames(vector_type.0) <- c("matrix", "type_matrix")

env_type.0 <- as.data.frame(cbind(select4.0$matrix, rep("environment", length(select4.0$matrix)))) %>%
  distinct()
colnames(env_type.0) <- c("matrix", "type_matrix")

food_type.0 <- as.data.frame(cbind(select5.0$matrix, rep("food", length(select5.0$matrix)))) %>%
  distinct()
colnames(food_type.0) <- c("matrix", "type_matrix")

# Source name -> compartment lookup, stacked in the same order as agent_mat.0
type_matrix.0 <- rbind(host_type.0 , vector_type.0 , env_type.0, food_type.0)

# Node table for the graph: agents first, then sources.
# NOTE(review): source names come from unique(agent_mat.0$matrix) while their
# types come from type_matrix.0, so the two are aligned by position only. This
# assumes no source name occurs in more than one compartment; otherwise the two
# vectors differ in length - confirm.
actors3.1 <- data.frame(name=c(unique.pathog3.0$pathogen, unique(agent_mat.0$matrix)),
                        type=c(rep("zoonotic agent", length(unique.pathog3.0$pathogen)), type_matrix.0$type_matrix))

# Edge table for the graph: agent -> source, weighted by the number of records
relations.1 <- data.frame(from=agent_mat.0$pathogen,
                          to=agent_mat.0$matrix,
                          weight = agent_mat.0$n)

# Assemble the undirected agent-source graph from the edge and node tables
network.1 <- graph_from_data_frame(
  d = relations.1,
  directed = FALSE,
  vertices = actors3.1
)

# Store each node's degree as a vertex attribute; it drives the node size and
# the labelling rule in the plot
V(network.1)$degree <- igraph::degree(network.1)

# Fill colour per compartment
pal.1 <- c(
  "zoonotic agent" = "grey20",
  "host" = "#FF6666",
  "vector" = "#E5BE1C",
  "environment" = "#33CC99",
  "food" = "#4589FF"
)

# Fruchterman-Reingold layout (no seed is set before this call)
l.1 <- layout_with_fr(network.1)

# Stretch both coordinates by 20% to spread the nodes
l.1[, 1:2] <- l.1[, 1:2] * 1.2

# Figure 3: nodes filled by compartment and sized by degree; only nodes with
# degree above 10, plus "Culex", are labelled
network_graph.1 <- ggraph(network.1, layout = l.1) +
  geom_edge_arc(
    aes(edge_linewidth = NULL),
    edge_colour = "grey76", strength = 0.1
  ) +
  geom_node_point(
    aes(fill = type, size = degree),
    color = "grey25", shape = 21, position = "jitter", alpha = 0.75
  ) +
  geom_node_text(
    aes(label = name, filter = degree > 10 | name == "Culex"),
    size = 4, repel = TRUE
  ) +
  scale_fill_manual(values = pal.1, name = "Compartment") +
  guides(fill = guide_legend(override.aes = list(size = 12))) +
  scale_edge_width(range = c(0.2, 3)) +
  scale_size(range = c(1, 10), name = "Degree") +
  theme_graph()
network_graph.1

ggsave("Figure3.svg", network_graph.1, width = 35, height = 30, units = "cm")

#~#~#~#~#~#~#~#~ degree distribution for each partition #~#~#~#~#~#~#~#~#
# Degree per zoonotic agent (number of distinct sources it is linked to) and
# mean degree per agent superkingdom
agent_type <- dplyr::distinct(agent.found.df, ncbi_pathog, superkingdom_pathog)

plasticity_agent <- network.1 %>%
  igraph::as_data_frame(what = "edges") %>%
  dplyr::distinct(from, to) %>%
  dplyr::group_by(from) %>%
  dplyr::summarise(nb_sources = n()) %>%
  left_join(agent_type, by = c("from" = "ncbi_pathog")) %>%
  dplyr::select(from, nb_sources, superkingdom_pathog)

plasticity_agent_type <- plasticity_agent %>%
  dplyr::group_by(superkingdom_pathog) %>%
  dplyr::summarise(mean(nb_sources, na.rm = TRUE))
plasticity_agent_type

# Panel b: degree distribution of the zoonotic agents
distrib_deg_agents <- ggplot(plasticity_agent, aes(x = nb_sources)) +
  geom_histogram(binwidth = 1, fill = "grey20", color = "#000000", alpha = .6) +
  labs(x = "Degree", y = "Count", title = "b") +
  theme_bw()
distrib_deg_agents

# Degree per zoonotic source (number of distinct agents linked to it)
agent_div_sources <- network.1 %>%
  igraph::as_data_frame(what = "edges") %>%
  dplyr::distinct(from, to) %>%
  dplyr::group_by(to) %>%
  dplyr::summarise(nb_agents = n())

# Panel a: degree distribution of the zoonotic sources
distrib_deg_sources <- ggplot(agent_div_sources, aes(x = nb_agents)) +
  geom_histogram(binwidth = 1, fill = "turquoise", color = "turquoise", alpha = .6) +
  labs(x = "Degree", y = "Count", title = "a") +
  theme_bw()
distrib_deg_sources

# Both panels side by side (sources left, agents right)
distrib_deg <- egg::ggarrange(
  distrib_deg_sources, distrib_deg_agents,
  ncol = 2, nrow = 1
)

#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ 
#~#~#~#~#~#~#~#~ Topology zoonotic web  #~#~#~#~#~#~#~#~#~#~
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~

# NOTE(review): verify what this check establishes here - the `type` vertex
# attribute holds compartment labels rather than a two-valued partition.
is_bipartite(network.1) #TRUE

# Conventional (bipartite web) representation
# Reshape the edge table into an agent x source interaction matrix: rows are
# agents, columns are sources, cells hold the number of records (0 when the
# pair was never reported).
data_web_full <- agent_mat.0 %>%
  left_join(type_matrix.0, by = "matrix") %>%
  dplyr::select(-type_matrix) %>%
  pivot_wider(
    names_from = "matrix",
    values_from = "n"
  ) %>%
  replace(is.na(.), 0) %>%
  column_to_rownames(var = "pathogen")

# Supplementary Figure 6: drawn by plotweb() on an SVG device
svg("Supp_Fig6.svg", width = 20, height = 12)
plotweb(
  data_web_full,
  text.rot = 90, y.lim = c(-1.5, 2.5),
  col.high = "#FF6666", bor.col.high = "darkred",
  col.low = "grey25", labsize = 0.8,
  bor.col.interaction = "grey70"
)
dev.off()

# Size of the bipartite network
# nodes
S_full <- vcount(network.1)
S_full #396
# interactions (edges)
L_full <- ecount(network.1)
L_full #658
# mean number of interactions per node
L.S_full <- L_full / S_full #1.66
L.S_full

# Largest connected component and the list of all components
giant_comp <- bc3net::getgcc(network.1)
components <- graph_extract_components(network.1)

## Node degree centrality bipartite graph
# Compartment label of every node, and how many nodes each compartment has
types_full <- V(network.1)$type
table(types_full)
# Degree of every node, tabulated with its compartment and sorted from the
# most to the least connected; node names are kept as row names
deg_full <- round(igraph::degree(network.1), digits = 2)
cent_df_full <- dplyr::arrange(
  data.frame(types_full, deg_full),
  desc(deg_full)
)
write.csv(cent_df_full, file = "bip.centrality.metrics_full.csv", row.names = TRUE)

#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~
############### Networks of zoonotic agent sharing  ################
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~

## One-mode projection onto the sources (the "higher" level, i.e. the columns
## of the interaction matrix)
source_net_full <- as.one.mode(data_web_full, project = "higher")

# Weighted, undirected igraph object built from the projected adjacency matrix
source_net_graph_full <- source_net_full %>%
  graph_from_adjacency_matrix(mode = "undirected", weighted = TRUE)

# Edge list of the source network as a data frame
source_net_graph_full_df <- igraph::as_data_frame(source_net_graph_full)

#~#~#~#~#~#~#~# Adjust for scientific research effort #~#~#~#~#~#~#~#~# 
# Research effort per source, computed on the full dataset (table_final).
# NOTE(review): each count is the number of ROWS per source; ref_id is selected
# but rows are not de-duplicated per reference - confirm this is the intended
# measure of "number of citations".

# hosts: rows per host species
cit_host <- table_final %>%
  dplyr::filter(!is.na(ncbi_host_sci)) %>%
  dplyr::select(ncbi_host_sci, ref_id) %>%
  dplyr::group_by(ncbi_host_sci) %>%
  dplyr::summarize(studies1 = n())
names(cit_host) <- c("name", "studies1")

# vectors: rows per vector genus
cit_vect <- table_final %>%
  dplyr::filter(!is.na(genus_vector)) %>%
  dplyr::select(genus_vector, ref_id) %>%
  dplyr::group_by(genus_vector) %>%
  dplyr::summarize(studies1 = n())
names(cit_vect) <- c("name", "studies1")

# environment: rows per harmonised environment category (same recoding as the
# environment nodes of the network, so the names match)
cit_env <- table_final %>%
  dplyr::filter(!is.na(type_env)) %>%
  mutate(type_env = case_when(
    type_env == "game butchery, slaughter knive" ~ "slaughter knife",
    type_env %in% c("food processing plant food preperation area",
                    "food processing plant drain water",
                    "food processing plant food conveyor belt") ~ "food processing plant",
    type_env %in% c("river water", "surface water") ~ "water",
    TRUE ~ type_env
  )) %>%
  dplyr::select(type_env, ref_id) %>%
  dplyr::group_by(type_env) %>%
  dplyr::summarize(studies1 = n())
names(cit_env) <- c("name", "studies1")

# food: rows per food label (origin + group pasted together), built the same
# way as the food nodes of the network so the names match
cit_food <- table_final %>%
  dplyr::filter(!is.na(type_food_1)) %>%
  dplyr::select(type_food_1, type_food_2, ref_id) %>%
  left_join(categories_food, by = "type_food_2") %>%
  dplyr::select(-type_food_2) %>%
  # move ref_id to the end so type_food_1 and group are adjacent for unite()
  dplyr::relocate(ref_id, .after = last_col()) %>%
  unite("food", type_food_1:group, sep = " ") %>%
  dplyr::group_by(food) %>%
  dplyr::summarize(studies1 = n()) %>%
  mutate(food = case_when(
    food == "plant-based food plant based food" ~ "plant-based food",
    TRUE ~ food
  ))
names(cit_food) <- c("name", "studies1")

# Stack the research-effort counts of all four compartments
cit_mat <- rbind(cit_host, cit_vect, cit_env, cit_food)

# Inspect the raw distribution of the counts
hist(cit_mat$studies1)
# The counts are right skewed, so they are normalised with a Box-Cox
# transformation (caret::BoxCoxTrans estimates it, predict() applies it)
lambda2 <- BoxCoxTrans(cit_mat$studies1)
bc_studies2 <- predict(lambda2, cit_mat$studies1)
hist(bc_studies2)

# Keep only the transformed effort next to the source name
cit_mat[["bc_studies2"]] <- bc_studies2
cit_mat <- dplyr::select(cit_mat, -studies1)


# Edge list of the one-mode source network (columns edge.1, edge.2) with the
# sharing weight of every edge.
source_edgelist2 <- data.frame(
  edge = igraph::as_edgelist(source_net_graph_full),
  weight = edge_attr(source_net_graph_full, name = "weight", index = E(source_net_graph_full))
)

# Every source in the network needs a research-effort value; stop with an
# explicit message rather than an obscure indexing error if one is missing.
sources_without_effort <- setdiff(
  c(source_edgelist2$edge.1, source_edgelist2$edge.2),
  cit_mat$name
)
if (length(sources_without_effort) > 0) {
  stop(
    "No research effort found in cit_mat for: ",
    paste(sources_without_effort, collapse = ", "),
    call. = FALSE
  )
}

# Box-Cox transformed research effort of both end points of every edge.
# match() is a vectorised lookup that takes the first matching name and, unlike
# a `1:n` index loop, also behaves correctly for an empty edge list.
cit1 <- cit_mat$bc_studies2[match(source_edgelist2$edge.1, cit_mat$name)]
cit2 <- cit_mat$bc_studies2[match(source_edgelist2$edge.2, cit_mat$name)]
# Effort of the less-studied end point of every edge
lower.cit <- pmin(cit1, cit2)

source_edgelist2$cit1 <- cit1
source_edgelist2$cit2 <- cit2
source_edgelist2$lower.cit <- lower.cit

# Regress the edge weight against the box-cox transformed number of citations
# of the less-studied of the two sources joined by the edge
plot(source_edgelist2$lower.cit,source_edgelist2$weight)
# NOTE(review): the seed is presumably kept for reproducibility of later steps;
# an ordinary least-squares fit does not draw random numbers - confirm.
set.seed(358)
sourceweightLM2 <- lm(source_edgelist2$weight~source_edgelist2$lower.cit-1) # where "-1" means remove the intercept
abline(sourceweightLM2, col = "red") # add the fitted line to the scatter plot

# Residuals of the fit, one per edge (assumes no edge was dropped for missing
# values, otherwise the lengths would not match)
source_edgelist2$residuals.lowercit <- residuals(sourceweightLM2)

# Rescale the residuals so that minimum value is 1
# (shift by the smallest residual; keeps all adjusted weights strictly positive)
source_edgelist2$adjusted.weights <- source_edgelist2$residuals.lowercit-min(source_edgelist2$residuals.lowercit)+1

# The residuals would reflect the amount of shared zoonotic agents relative to sampling effort,
# under the assumption that the measure of sampling effort should be from the lesser studied species. 
# It would be a relative measure of parasite sharing that is comparable across edges in the network.

## Build the new zoonotic agent sharing matrix
# Start from an all-zero matrix with the dimensions and names of the one-mode
# source matrix.
source_net.adjustedweights2 <- matrix(
  0,
  nrow = nrow(source_net_full),
  ncol = ncol(source_net_full),
  dimnames = list(rownames(source_net_full), colnames(source_net_full))
)

# Positions of both end points of every edge in the matrix. Vectorised lookup:
# unlike a `1:n` index loop it is also correct for an empty edge list.
sp1 <- match(source_edgelist2$edge.1, colnames(source_net.adjustedweights2))
sp2 <- match(source_edgelist2$edge.2, colnames(source_net.adjustedweights2))
stopifnot(
  "edge end point not found among the matrix columns" = !anyNA(sp1) && !anyNA(sp2)
)

# Write the effort-adjusted weight into both triangles (symmetric matrix);
# a two-column index matrix addresses one (row, column) cell per edge.
source_net.adjustedweights2[cbind(sp1, sp2)] <- source_edgelist2$adjusted.weights
source_net.adjustedweights2[cbind(sp2, sp1)] <- source_edgelist2$adjusted.weights

## Create sampling effort-corrected one-mode networks
source_net_graph_adj_full <- graph_from_adjacency_matrix(
  source_net.adjustedweights2,
  mode = "undirected",
  weighted = TRUE
)

#remove all but the largest component
source_net_graph_adj_full2 <-remove_small_components(source_net_graph_adj_full)

## compute node centrality measures
# Node type lookup: row names of `cent_df_full` become the join key "node"
type_ <- rownames_to_column(
  dplyr::select(cent_df_full, types_full),
  var = "node"
)

# Strength: sum of the adjusted weights of the edges incident to each node
str_full <- round(
  igraph::strength(
    source_net_graph_adj_full2,
    weights = E(source_net_graph_adj_full2)$weight
  ),
  2
)
deg_full <- round(igraph::degree(source_net_graph_adj_full2), 2)
# Betweenness and closeness use the inverse weight as edge weight
bet_full <- round(
  igraph::betweenness(
    source_net_graph_adj_full2,
    weights = 1 / E(source_net_graph_adj_full2)$weight,
    normalized = TRUE
  ),
  4
)
clos_full <- round(
  igraph::closeness(
    source_net_graph_adj_full2,
    weights = 1 / E(source_net_graph_adj_full2)$weight,
    normalized = TRUE
  ),
  4
)

# One row per node: the four centrality measures plus the node type
cent_onemode_full_df <- left_join(
  rownames_to_column(
    data.frame(
      deg_full = deg_full,
      str_full = str_full,
      bet_full = bet_full,
      clos_full = clos_full
    ),
    var = "node"
  ),
  type_,
  by = "node"
)
write.csv(
  cent_onemode_full_df,
  "cent_onemode_sci_name_df_correctedweight_full.csv",
  row.names = FALSE
)

# Summary statistics of every column, computed separately for each value of
# `types_full`
cent_onemode_full_df  %>% split(.$types_full) %>% map(summary)

# Comparison of each centrality measure between types of zoonotic sources:
# Kruskal-Wallis test across all types, followed (for degree, strength and
# closeness) by pairwise Wilcoxon tests with Benjamini-Hochberg ("BH")
# adjusted p-values
kruskal.test(deg_full ~ types_full, data = cent_onemode_full_df)
pairwise.wilcox.test(cent_onemode_full_df$deg_full, cent_onemode_full_df$types_full,
                     p.adjust.method = "BH")

kruskal.test(str_full ~ types_full, data = cent_onemode_full_df)
pairwise.wilcox.test(cent_onemode_full_df$str_full, cent_onemode_full_df$types_full,
                     p.adjust.method = "BH")

# NOTE(review): no pairwise follow-up is run for betweenness - presumably
# because the global test was not significant; confirm
kruskal.test(bet_full ~ types_full, data = cent_onemode_full_df)

kruskal.test(clos_full ~ types_full, data = cent_onemode_full_df)
pairwise.wilcox.test(cent_onemode_full_df$clos_full, cent_onemode_full_df$types_full,
                     p.adjust.method = "BH")


# Kendall rank correlation, used to compare the ranking of nodes produced by
# two centrality measures.
#
# x, y: numeric vectors of equal length (one value per node).
# Returns the "htest" object produced by cor.test(); `exact = FALSE` requests
# the normal approximation rather than an exact p-value.
kendall_test <- function(x, y) {
  stats::cor.test(x, y, method = "kendall", exact = FALSE)
}

# Pairwise Kendall correlations between the four centrality measures
# (degree, betweenness, closeness, strength)
degbetw <- with(cent_onemode_full_df, kendall_test(x = deg_full, y = bet_full))
degclos <- with(cent_onemode_full_df, kendall_test(x = deg_full, y = clos_full))
degstr <- with(cent_onemode_full_df, kendall_test(x = deg_full, y = str_full))
betwclos <- with(cent_onemode_full_df, kendall_test(x = bet_full, y = clos_full))
betwstr <- with(cent_onemode_full_df, kendall_test(x = bet_full, y = str_full))
closstr <- with(cent_onemode_full_df, kendall_test(x = clos_full, y = str_full))

#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#
#~#~#~#~#~#~#~#~#~# Visualize unipartite network #~#~#~#~#~#~#~#~#~#~
#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#

# Node attribute
# Types of the nodes that are neither zoonotic agents nor hosts
type_node_source <- dplyr::filter(
  type_,
  !types_full %in% c("zoonotic agent", "host")
)

# One class label per vertex: the taxonomic class for hosts, otherwise the
# node type taken from `type_node_source`
host_node_adj <- as.data.frame(V(source_net_graph_adj_full2)) %>%
  rownames_to_column(var = "actor_host") %>%
  left_join(agent.found.df, by = c("actor_host" = "ncbi_host_sci")) %>%
  dplyr::select(actor_host, class_host) %>%
  distinct() %>%
  left_join(type_node_source, by = c("actor_host" = "node")) %>%
  distinct() %>%
  dplyr::mutate(
    class_host = case_when(
      is.na(class_host) ~ types_full,
      TRUE ~ class_host
    )
  ) %>%
  dplyr::select(-types_full)
# NOTE(review): assumes `host_node_adj` has exactly one row per vertex, in
# vertex order - confirm that the joins cannot duplicate rows
V(source_net_graph_adj_full2)$class <- host_node_adj$class_host

# add vertices size to network (degree)
V(source_net_graph_adj_full2)$degree <- igraph::degree(source_net_graph_adj_full2)

# Custom colour palette, one colour per type of zoonotic source
source_palette <- c(
  "Mammalia" = "#1A5878",
  "Aves" = "#AD8941",
  "Testudinata" = "#E99093",
  "Lepidosauria" = "#8968CD",
  "environment" = "#33CC00",
  "vector" = "grey20",
  "food" = "brown3"
)

# Panel "a": one-mode network, nodes coloured by class and sized by degree,
# edges coloured and sized by adjusted weight; only nodes with degree >= 50
# are labelled
set.seed(538)
source_one_mode <- ggraph(source_net_graph_adj_full2, layout = "lgl") +
  geom_edge_arc(
    aes(edge_linewidth = weight, edge_colour = weight, alpha = 0.8),
    strength = 0.1,
    show.legend = FALSE
  ) +
  scale_edge_colour_gradient(
    low = "grey80",
    high = "darkblue",
    guide = "edge_colourbar",
    aesthetics = "edge_colour"
  ) +
  geom_node_point(
    aes(size = degree, fill = class),
    shape = 21,
    position = "jitter",
    alpha = 0.75
  ) +
  geom_node_text(
    aes(label = name, filter = degree >= 50),
    size = 4,
    repel = TRUE
  ) +
  scale_fill_manual(values = source_palette, name = "Type of zoonotic source") +
  guides(fill = guide_legend(override.aes = list(size = 6))) +
  scale_edge_width(range = c(0.2, 3)) +
  scale_size(range = c(1, 10), name = "Degree") +
  theme_graph() +
  ggtitle("a")
source_one_mode

#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#
####################### Community detection #########################
#~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~##~#~#~#~#~#~#~#~#~#~#~#

# Leiden community detection on the sampling effort-corrected network,
# optimising modularity. The seed is fixed before the call so that the
# partition can be reproduced.
set.seed(538)
k_full <- cluster_leiden(
  source_net_graph_adj_full2,
  objective_function = "modularity",
  # Spelled out in full: `weight` only worked through partial argument
  # matching, which fails when the formal comes after `...`.
  # NULL is passed unchanged from the original call.
  weights = NULL,
  resolution_parameter = 2,
  n_iterations = 10
) # customize function with parameters
k_full$nb_clusters # 6 in the original analysis

# Node-to-community table, ordered by community id
source_comm_leiden <- dplyr::arrange(
  data.frame(
    k_full.names = k_full$names,
    k_full.membership = k_full$membership
  ),
  k_full.membership
)

# export
write.csv(source_comm_leiden, "source_comm_leiden.csv", row.names = FALSE)

# Number of nodes in each community
size_comunities_leid <- source_comm_leiden %>%
  dplyr::group_by(k_full.membership) %>%
  dplyr::count()
size_comunities_leid

# Plot communities
# Store the community id on the vertices so that it can be mapped to fill
V(source_net_graph_adj_full2)$Leiden <- as.factor(k_full$membership)

col_leiden <- c(
  "#F0027F", "#377EB8", "#4DAF4A", "#BEAED4",
  "#FF7F00", "#FFFF33", "#A65628"
)

# Panel "b": same network as panel "a", nodes coloured by Leiden community
set.seed(538)
source_one_modeleiden <- ggraph(source_net_graph_adj_full2, layout = "lgl") +
  geom_edge_arc(
    aes(edge_linewidth = weight, edge_colour = weight, alpha = 0.8),
    strength = 0.1,
    show.legend = FALSE
  ) +
  scale_edge_colour_gradient(
    low = "grey80",
    high = "darkblue",
    guide = "edge_colourbar",
    aesthetics = "edge_colour"
  ) +
  geom_node_point(
    aes(size = degree, fill = Leiden),
    shape = 21,
    position = "jitter",
    alpha = 0.75
  ) +
  geom_node_text(
    aes(label = name, filter = degree >= 50),
    size = 4,
    repel = TRUE
  ) +
  scale_fill_manual(values = col_leiden, name = "Leiden communities") +
  guides(fill = guide_legend(override.aes = list(size = 4))) +
  scale_edge_width(range = c(0.2, 3)) +
  scale_size(range = c(1, 10), name = "Degree") +
  theme_graph() +
  ggtitle("b")
source_one_modeleiden

# Export
## Combine both panels side by side
onemode_both <- egg::ggarrange(
  source_one_mode,
  source_one_modeleiden,
  ncol = 2,
  nrow = 1
)
ggsave(
  "Figure4.svg",
  onemode_both,
  width = 60,
  height = 30,
  units = "cm"
)

# Zoonotic agent composition for each community
# Distinct (from, to) pairs of the edges of `network.1`
assoc <- distinct(
  dplyr::select(
    igraph::as_data_frame(network.1, what = "edges"),
    from, to
  )
)

# value = number of times a zoonotic agent is found in each community
circ_comm <- source_comm_leiden %>%
  left_join(assoc, by = c("k_full.names" = "to")) %>%
  dplyr::group_by(k_full.membership, from) %>%
  dplyr::summarise(value = n())

# Number of rows of `circ_comm` (distinct `from` values) per community
summary_n_comm <- circ_comm %>%
  dplyr::group_by(k_full.membership) %>%
  tally()

# Heatmap: one tile per (zoonotic agent, community) pair found in `circ_comm`,
# filled by `value` on a log2 colour scale
heatmap_community <- ggplot(circ_comm) +
  geom_tile(
    aes(x = from, y = k_full.membership, fill = value),
    color = "white",
    lwd = 0.01,
    linetype = 1
  ) +
  scale_fill_viridis_c(
    option = "D",
    name = "Number of zoonotic sources",
    trans = "log2",
    breaks = trans_breaks("log2", function(x) 2^x)
  ) +
  xlab("") +
  ylab("Zoonotic source community") +
  theme_ipsum() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  theme(
    axis.text.x = element_text(size = 7),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 10)
  ) +
  # One axis break per community id
  scale_y_continuous(breaks = circ_comm$k_full.membership)
heatmap_community

ggsave(
  "Supp_Fig7.svg",
  heatmap_community,
  width = 50,
  height = 20,
  units = "cm"
)


#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~
############### Human-animal-environment interfaces ##################
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~

# Finding triangles (cliques) in the network that represent the circulation of zoonotic agents at human-animal-environment interfaces

## Identifying the compartment of each node
# The compartment of each node is defined from these data sets
# Food: animal / environment
cit_food$name
# foods of animal origin
name.food.animal <- pull(
  dplyr::filter(cit_food, name != "plant-based food"),
  name
)
# Environment: env
cit_env$name
# Host: animal / human
cit_host$name
length(cit_host$name)
name.host.animal <- pull(
  dplyr::filter(cit_host, name != "Homo sapiens"),
  name
)
# Vector: env
cit_vect$name

# Collect all nodes
nd.adjustedweights2 <- data.frame(names = rownames(source_net.adjustedweights2))
# And define the compartment based on the data sets above; the first matching
# rule wins, and a node matching none of them keeps its own name as compartment
nd.adjustedweights2$compartment <- with(
  nd.adjustedweights2,
  ifelse(
    names %in% cit_env$name, "env", # environment
    ifelse(
      names %in% cit_vect$name, "env", # vector
      ifelse(
        names %in% name.host.animal, "animal", # animal host
        ifelse(
          names %in% "Homo sapiens", "human", # human host
          ifelse(
            names %in% name.food.animal, "animal", # food of animal origin
            ifelse(
              names %in% "plant-based food", "env", # plant-based food
              names
            )
          )
        )
      )
    )
  )
)

## Finding triangle cliques
# All cliques of exactly three nodes in the full corrected network
cl.tri <- cliques(source_net_graph_adj_full, min = 3, max = 3)
length(cl.tri)

# Compartment of each node of each clique. lapply() handles an empty clique
# list and avoids creating a variable called `c`, which shadowed base::c().
cl.tri.comp <- lapply(cl.tri, function(clique_nodes) {
  nd.adjustedweights2$compartment[
    nd.adjustedweights2$names %in% clique_nodes$name
  ]
})

# Keep the cliques whose three nodes carry three different compartment labels.
# NOTE(review): a node matching none of the compartment rules keeps its own
# name as label and therefore counts as a distinct compartment - confirm that
# no such node exists.
ls.tri <- which(
  vapply(
    cl.tri.comp,
    function(comps) sum(!duplicated(comps)) == 3,
    logical(1)
  )
)
length(ls.tri) # 153 in the original analysis
# There are 153 triangle cliques that are composed of the three "One Health" compartments

# Filter the cliques that are composed of the three "One Health" compartments
filtered.cl.tri <- cl.tri[ls.tri]

## Secondly, we observe the weight for each pair of nodes in each clique
# For every clique: a matrix with one row per pair of nodes (two vertex id
# columns) and a third column `weight` holding the weight of the edge between
# them. Built with lapply()/vapply() rather than growing objects in loops
# whose index was named `c`.
clique.weight <- lapply(filtered.cl.tri, function(clique_nodes) {
  # All possible pairs of nodes of the clique
  pair.of.nodes <- t(combn(clique_nodes, 2))

  # Edge weight of each pair, read from the graph
  weight <- vapply(
    seq_len(nrow(pair.of.nodes)),
    function(i) {
      source_net_graph_adj_full[pair.of.nodes[i, 1], pair.of.nodes[i, 2]]
    },
    numeric(1)
  )

  cbind(pair.of.nodes, weight)
})

## Sum the weights to calculate the clique weight
sum.weight <- vapply(
  clique.weight,
  function(pair_weights) sum(pair_weights[, 3]),
  numeric(1)
)

# Plotting
sum.weight <- data.frame(sum.weight)
summary(sum.weight)

# Density histogram of the total edge weight of the cliques, with a kernel
# density overlay. The dashed vertical line marks the sixth highest clique
# weight (or the lowest one when fewer than six cliques exist).
hist.cl <- ggplot(sum.weight, aes(x = sum.weight)) +
  # after_stat(density) replaces the deprecated `..density..` notation
  geom_histogram(
    aes(y = after_stat(density)),
    fill = "mediumblue",
    bins = 50,
    alpha = 0.7
  ) +
  # `linewidth` replaces the deprecated `size` for line geoms
  geom_density(color = "firebrick2", linewidth = 0.7) +
  geom_vline(
    xintercept = min(head(sort(sum.weight$sum.weight, decreasing = TRUE), 6)),
    colour = "gray30",
    linetype = "dashed"
  ) +
  xlab("clique total edge weight") +
  theme_classic()

# Finding the highest six values for plotting the subgraphs
# Indices are returned in clique order, not in weight order; ties with the
# sixth value can yield more than six indices.
high.val <- which(
  sum.weight$sum.weight %in%
    head(sort(sum.weight$sum.weight, decreasing = TRUE), 6)
)

# add the compartments for color identification
V(source_net_graph_adj_full)$compartment <- nd.adjustedweights2$compartment
# define the compartment color
comps.col <- c("animal" = "dodgerblue3", "env" = "#33CC00", "human" = "goldenrod")

# One plot per selected clique: the three nodes coloured by compartment and the
# three edges labelled with their weight. lapply() replaces a loop that grew
# the list with append() and overwrote its own index `p` with the plot.
plot.sub.g <- lapply(high.val, function(clique_id) {
  sub.g <- igraph::induced_subgraph(
    graph = source_net_graph_adj_full,
    vids = filtered.cl.tri[[clique_id]]
  )

  ggraph(sub.g, layout = "lgl") +
    geom_edge_arc(
      aes(edge_width = weight, label = round(weight, 2)),
      edge_colour = "grey76",
      strength = 0.1
    ) +
    geom_node_point(
      aes(fill = factor(compartment)),
      shape = 21,
      size = 7,
      color = "black"
    ) +
    scale_fill_manual(values = comps.col) +
    geom_node_text(aes(label = name), repel = TRUE, size = 4) +
    theme_void() +
    theme(legend.position = "none")
})

## Plot in one panel
# Add a grey border to (at most) the first six clique plots and arrange them
# in a grid. Mapping over the list replaces six hard-coded indices, which
# failed when fewer than six plots were available.
group.clique.p <- plot_grid(
  plotlist = lapply(
    head(plot.sub.g, 6),
    function(panel) {
      panel +
        panel_border(color = "grey", size = .8, linetype = 1, remove = FALSE)
    }
  )
)

# Figure 6: histogram of clique weights (a) above the clique subgraphs (b).
# Both names used by the original statement are still assigned.
Figure6 <- cliq.combined <- plot_grid(
  hist.cl + panel_border(color = "white", size = 1, linetype = 1, remove = FALSE),
  group.clique.p + theme(plot.margin = unit(c(.2, .2, .2, .7), "cm")),
  labels = c("a", "b"),
  ncol = 1,
  rel_heights = c(1, 1)
)
# Export
ggsave2("Figure6.svg", Figure6, width = 30, height = 30, units = "cm")

#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~
#################### Examples subgraphs ##############################
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~


#~#~#~#~#~#~#~#~ Sub-graph emerging zoonotic agents #~#~#~#~#~#~#~#~#
# Emerging agents and their direct neighbours in `network.1`
emerg_agent <- igraph::neighborhood(
  network.1,
  nodes = unique(emerging_agents),
  order = 1
)

# Fruchterman-Reingold layout of the induced subgraph
l.2 <- layout_with_fr(
  igraph::induced_subgraph(network.1, vids = unlist(emerg_agent))
)
# Increase spacing between nodes by doubling both coordinates
l.2[, 1:2] <- l.2[, 1:2] * 2

# Agents are labelled in bold; all other nodes get repelled grey labels
emerg_graph <- igraph::induced_subgraph(network.1, vids = unlist(emerg_agent)) %>%
  tidygraph::as_tbl_graph() %>%
  ggraph(layout = l.2) +
  geom_edge_arc(
    aes(edge_linewidth = weight),
    edge_colour = "grey76",
    strength = 0.1
  ) +
  geom_node_point(
    aes(fill = type, size = degree),
    color = "grey45",
    shape = 21,
    position = "jitter",
    alpha = 0.8
  ) +
  geom_node_text(
    aes(label = name, filter = type == "zoonotic agent"),
    size = 4,
    fontface = "bold",
    nudge_x = 0.3,
    nudge_y = 0.4
  ) +
  geom_node_text(
    aes(label = name, filter = type != "zoonotic agent"),
    size = 4,
    repel = TRUE,
    color = "grey20",
    nudge_x = -0.4,
    nudge_y = -0.2
  ) +
  scale_fill_manual(values = pal.1, name = "Compartment") +
  guides(fill = guide_legend(override.aes = list(size = 6))) +
  scale_edge_width(range = c(0.2, 3)) +
  scale_size(range = c(2, 10), name = "Degree in the full network") +
  theme_graph()
#emerg_graph
ggsave(
  "Supp_Fig9.svg",
  emerg_graph,
  width = 40,
  height = 30,
  units = "cm"
)

# Edge list of the emerging-agent subgraph, with the vertex attributes of both
# endpoints attached (from_* / to_* columns)
emerging.df <- as_long_data_frame(
  igraph::induced_subgraph(network.1, vids = unlist(emerg_agent))
)

# Number of distinct `to_name` nodes per (`from_name`, `to_type`) combination
emerging_range <- dplyr::distinct(
  dplyr::select(emerging.df, from_name, to_name, to_type)
) %>%
  dplyr::group_by(from_name, to_type) %>%
  dplyr::summarise(n())
emerging_range

# Number of distinct `to_name` nodes per `to_type`, over the whole subgraph
emerging_range_summary <- dplyr::distinct(
  dplyr::select(emerging.df, to_name, to_type)
) %>%
  dplyr::group_by(to_type) %>%
  dplyr::summarise(n())
emerging_range_summary


#~#~#~#~#~#~#~#~  Sus scrofa (w) #~#~#~#~#~#~#~#~ 
# Nodes within two steps of "Sus scrofa (w)" in `network.1`
wildboard <- igraph::neighborhood(
  network.1,
  nodes = c("Sus scrofa (w)"),
  order = 2
)

# The focal node is labelled in bold; other nodes are labelled only when
# their degree is at least 5
wildboar_plot <- igraph::induced_subgraph(network.1, vids = unlist(wildboard)) %>%
  tidygraph::as_tbl_graph() %>%
  ggraph(layout = "lgl") +
  geom_edge_arc(
    aes(edge_linewidth = weight),
    edge_colour = "grey55",
    alpha = 0.8,
    strength = 0.1,
    show.legend = FALSE
  ) +
  geom_node_point(
    aes(fill = type, size = degree * 2),
    color = "grey25",
    shape = 21,
    position = "jitter",
    alpha = 0.75
  ) +
  geom_node_text(
    aes(label = name, filter = degree >= 5 & name != "Sus scrofa (w)"),
    size = 4,
    repel = TRUE
  ) +
  geom_node_text(
    aes(label = name, filter = name == "Sus scrofa (w)"),
    fontface = "bold",
    size = 4,
    repel = TRUE
  ) +
  scale_fill_manual(values = pal.1, name = "Compartment") +
  guides(fill = guide_legend(override.aes = list(size = 6))) +
  scale_edge_width(range = c(0.2, 3)) +
  scale_size(range = c(3, 10), name = "Degree in the full network") +
  theme_graph()
# Export
ggsave(
  "Supp_Fig10.svg",
  wildboar_plot,
  width = 30,
  height = 30,
  units = "cm"
)


#~#~#~#~#~#~#~#~  Listeria #~#~#~#~#~#~#~#~ 
# "Listeria" and "Listeria monocytogenes" with their direct neighbours
listeria <- network.1 %>%
  igraph::neighborhood(nodes = c("Listeria", "Listeria monocytogenes"), order = 1)

listeria_plot <- igraph::induced_subgraph(network.1, vids = unlist(listeria)) %>%
  tidygraph::as_tbl_graph() %>%
  ggraph(layout = "nicely") +
  geom_edge_arc(
    aes(edge_linewidth = weight),
    edge_colour = "grey55",
    alpha = 0.8,
    strength = 0.1,
    show.legend = FALSE
  ) +
  geom_node_point(
    aes(fill = type, size = degree * 2),
    color = "grey25",
    shape = 21,
    position = "jitter",
    alpha = 0.75
  ) +
  # Plain labels for every node except the two focal agents. `%in%` tests set
  # membership; `name != c(a, b)` recycled the two names element-wise along
  # the node vector, so a focal node could also receive a plain label.
  geom_node_text(
    aes(
      label = name,
      filter = !(name %in% c("Listeria", "Listeria monocytogenes"))
    ),
    size = 4,
    repel = TRUE
  ) +
  # Bold labels for the two focal agents
  geom_node_text(
    aes(
      label = name,
      filter = name %in% c("Listeria", "Listeria monocytogenes")
    ),
    fontface = "bold",
    size = 4,
    repel = TRUE,
    nudge_y = -0.1
  ) +
  scale_fill_manual(values = pal.1, name = "Compartment") +
  guides(fill = guide_legend(override.aes = list(size = 6))) +
  scale_edge_width(range = c(0.2, 3)) +
  scale_size(range = c(3, 10), name = "Degree in the full network") +
  theme_graph()
# Export
ggsave("Supp_Fig11.svg", listeria_plot, width = 30, height = 30, units = "cm")


#~#~#~#~#~#~#~#~ EHEC, VTEC, STEC sub-system #~#~#~#~#~#~#~#~ 
# The three Escherichia coli groups with their direct neighbours
E.coli <- igraph::neighborhood(
  network.1,
  nodes = c(
    "Escherichia coli VTEC",
    "Escherichia coli EHEC",
    "Escherichia coli STEC"
  ),
  order = 1
)

# Every node of the subgraph is labelled
E.coli_plot <- igraph::induced_subgraph(network.1, vids = unlist(E.coli)) %>%
  tidygraph::as_tbl_graph() %>%
  ggraph(layout = "nicely") +
  geom_edge_arc(
    aes(edge_linewidth = weight),
    edge_colour = "grey55",
    alpha = 0.8,
    strength = 0.1,
    show.legend = FALSE
  ) +
  geom_node_point(
    aes(fill = type, size = degree * 2),
    color = "grey25",
    shape = 21,
    position = "jitter",
    alpha = 0.75
  ) +
  geom_node_text(aes(label = name), size = 4, repel = TRUE) +
  scale_fill_manual(values = pal.1, name = "Compartment") +
  guides(fill = guide_legend(override.aes = list(size = 6))) +
  scale_edge_width(range = c(0.2, 3)) +
  scale_size(range = c(3, 10), name = "Degree in the full network") +
  theme_graph()
# Export
ggsave(
  "Supp_Fig12.svg",
  E.coli_plot,
  width = 30,
  height = 30,
  units = "cm"
)

#~#~#~#~#~#~#~#~ Exploratory sub-systems (order-2 neighbourhoods) #~#~#~#~#~#~#~#~ 

# Plot the subgraph induced by a set of neighbourhoods.
#
# The seven sub-system plots below shared one copy-pasted pipeline; it now
# lives in this helper so that they cannot drift apart.
#
# neighbourhoods:   list of vertex sequences as returned by
#                   igraph::neighborhood(); all their vertices are kept.
# min_label_degree: only nodes whose `degree` vertex attribute is at least
#                   this value are labelled.
# graph:            graph the neighbourhoods were taken from.
# Returns a ggraph/ggplot object (printed when called at top level).
plot_subsystem <- function(neighbourhoods,
                           min_label_degree = 1,
                           graph = network.1) {
  igraph::induced_subgraph(graph, vids = unlist(neighbourhoods)) %>%
    tidygraph::as_tbl_graph() %>%
    ggraph(layout = "nicely") +
    geom_edge_link0(aes(edge_linewidth = NULL), edge_colour = "grey66") +
    geom_node_point(
      aes(fill = type, size = degree * 2),
      color = "grey25",
      shape = 21,
      position = "jitter",
      alpha = 0.75
    ) +
    geom_node_text(
      aes(label = name, filter = degree >= min_label_degree),
      size = 5,
      repel = TRUE
    ) +
    scale_fill_manual(values = pal.1, name = "Compartment") +
    guides(fill = guide_legend(override.aes = list(size = 6))) +
    scale_edge_width(range = c(0.2, 3)) +
    scale_size(range = c(1, 10), name = "Degree") +
    theme_graph()
}

#~#~#~#~#~#~#~#~ Culex sub-system #~#~#~#~#~#~#~#~ 
# Zoonotic agents, hosts and the "Culex" node (not used by the plot below)
culex_subgraph <- igraph::induced_subgraph(
  network.1,
  V(network.1)[type %in% c("zoonotic agent", "host") | name %in% c("Culex")]
)

culex <- network.1 %>%
  igraph::neighborhood(nodes = c("Culex"), order = 2)

# Only nodes with degree >= 2 are labelled in this plot
plot_subsystem(culex, min_label_degree = 2)

#~#~#~#~#~#~#~#~  Ixodes sub-system #~#~#~#~#~#~#~#~ 
ixodes <- network.1 %>%
  igraph::neighborhood(nodes = c("Ixodes"), order = 2)

plot_subsystem(ixodes, min_label_degree = 1)


#~#~#~#~#~#~#~#~ USUV and West Nile sub-system #~#~#~#~#~#~#~#~ 
usuv_wnv <- network.1 %>%
  igraph::neighborhood(nodes = c("Usutu virus", "West Nile virus"), order = 2)

plot_subsystem(usuv_wnv, min_label_degree = 1)

#~#~#~#~#~#~#~#~ S. enterica sub-system #~#~#~#~#~#~#~#~ 
S_enterica <- network.1 %>%
  igraph::neighborhood(nodes = c("Salmonella enterica"), order = 2)

plot_subsystem(S_enterica, min_label_degree = 1)

#~#~#~#~#~#~#~#~ Gallus gallus sub-system #~#~#~#~#~#~#~#~ 
gallus <- network.1 %>%
  igraph::neighborhood(nodes = c("Gallus gallus"), order = 2)

plot_subsystem(gallus, min_label_degree = 1)

#~#~#~#~#~#~#~#~ Equus caballus sub-system #~#~#~#~#~#~#~#~ 
equus <- network.1 %>%
  igraph::neighborhood(nodes = c("Equus caballus"), order = 2)

plot_subsystem(equus, min_label_degree = 1)

